diff --git a/sourcecode-parser/graph/callgraph/builder.go b/sourcecode-parser/graph/callgraph/builder.go index 7c2d4af5..8e03eeef 100644 --- a/sourcecode-parser/graph/callgraph/builder.go +++ b/sourcecode-parser/graph/callgraph/builder.go @@ -1,1101 +1,88 @@ package callgraph import ( - "log" - "os" - "path/filepath" - "strings" - "sync" - sitter "github.com/smacker/go-tree-sitter" "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph" - "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/analysis/taint" + cgbuilder "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/builder" "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/core" - "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/extraction" cgregistry "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/registry" "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/resolution" ) -// ImportMapCache provides thread-safe caching of ImportMap instances. -// This avoids re-parsing imports from the same file multiple times. -// -// The cache uses a read-write mutex to allow concurrent reads while -// ensuring safe writes. This is critical for performance since: -// - Import extraction involves tree-sitter parsing (expensive) -// - Many files may import the same modules -// - Build call graph processes files sequentially (for now) -// -// Example usage: +// ImportMapCache is a type alias for backward compatibility. // -// cache := NewImportMapCache() -// importMap := cache.GetOrExtract(filePath, sourceCode, registry) -type ImportMapCache struct { - cache map[string]*core.ImportMap // Maps file path to ImportMap - mu sync.RWMutex // Protects cache map -} +// Deprecated: Use builder.ImportMapCache instead. +// This type alias will be removed in a future version. +type ImportMapCache = cgbuilder.ImportMapCache // NewImportMapCache creates a new empty import map cache. 
-func NewImportMapCache() *ImportMapCache { - return &ImportMapCache{ - cache: make(map[string]*core.ImportMap), - } -} - -// Get retrieves an ImportMap from the cache if it exists. -// -// Parameters: -// - filePath: absolute path to the Python file -// -// Returns: -// - ImportMap and true if found in cache, nil and false otherwise -func (c *ImportMapCache) Get(filePath string) (*core.ImportMap, bool) { - c.mu.RLock() - defer c.mu.RUnlock() - - importMap, ok := c.cache[filePath] - return importMap, ok -} - -// Put stores an ImportMap in the cache. -// -// Parameters: -// - filePath: absolute path to the Python file -// - importMap: the extracted ImportMap to cache -func (c *ImportMapCache) Put(filePath string, importMap *core.ImportMap) { - c.mu.Lock() - defer c.mu.Unlock() - - c.cache[filePath] = importMap -} - -// GetOrExtract retrieves an ImportMap from cache or extracts it if not cached. -// This is the main entry point for using the cache. -// -// Parameters: -// - filePath: absolute path to the Python file -// - sourceCode: file contents (only used if extraction needed) -// - registry: module registry for resolving imports // -// Returns: -// - ImportMap from cache or newly extracted -// - error if extraction fails (cache misses only) -// -// Thread-safety: -// - Multiple goroutines can safely call GetOrExtract concurrently -// - First caller for a file will extract and cache -// - Subsequent callers will get cached result -func (c *ImportMapCache) GetOrExtract(filePath string, sourceCode []byte, registry *core.ModuleRegistry) (*core.ImportMap, error) { - // Try to get from cache (fast path with read lock) - if importMap, ok := c.Get(filePath); ok { - return importMap, nil - } - - // Cache miss - extract imports (expensive operation) - importMap, err := resolution.ExtractImports(filePath, sourceCode, registry) - if err != nil { - return nil, err - } - - // Store in cache for future use - c.Put(filePath, importMap) - - return importMap, nil +// Deprecated: 
Use builder.NewImportMapCache instead. +func NewImportMapCache() *ImportMapCache { + return cgbuilder.NewImportMapCache() } // BuildCallGraph constructs the complete call graph for a Python project. -// This is Pass 3 of the 3-pass algorithm: -// - Pass 1: BuildModuleRegistry - map files to modules -// - Pass 2: ExtractImports + ExtractCallSites - parse imports and calls -// - Pass 3: BuildCallGraph - resolve calls and build graph // -// Algorithm: -// 1. For each Python file in the project: -// a. Extract imports to build ImportMap -// b. Extract call sites from AST -// c. Extract function definitions from main graph -// 2. For each call site: -// a. Resolve target name using ImportMap -// b. Find target function definition in registry -// c. Add edge from caller to callee -// d. Store detailed call site information -// -// Parameters: -// - codeGraph: the existing code graph with parsed AST nodes -// - registry: module registry mapping files to modules -// - projectRoot: absolute path to project root -// -// Returns: -// - CallGraph: complete call graph with edges and call sites -// - error: if any step fails -// -// Example: -// Given: -// File: myapp/views.py -// def get_user(): -// sanitize(data) # call to myapp.utils.sanitize -// -// Creates: -// edges: {"myapp.views.get_user": ["myapp.utils.sanitize"]} -// reverseEdges: {"myapp.utils.sanitize": ["myapp.views.get_user"]} -// callSites: {"myapp.views.get_user": [CallSite{Target: "sanitize", ...}]} +// Deprecated: Use builder.BuildCallGraph instead. 
func BuildCallGraph(codeGraph *graph.CodeGraph, registry *core.ModuleRegistry, projectRoot string) (*core.CallGraph, error) { - callGraph := core.NewCallGraph() - - // Initialize import map cache for performance - // This avoids re-parsing imports from the same file multiple times - importCache := NewImportMapCache() - - // Initialize type inference engine - typeEngine := resolution.NewTypeInferenceEngine(registry) - typeEngine.Builtins = cgregistry.NewBuiltinRegistry() - - // Phase 3 Task 12: Initialize attribute registry for tracking class attributes - typeEngine.Attributes = cgregistry.NewAttributeRegistry() - - // PR #3: Detect Python version and load stdlib registry from remote CDN - pythonVersion := detectPythonVersion(projectRoot) - log.Printf("Detected Python version: %s", pythonVersion) - - // Create remote registry loader - remoteLoader := cgregistry.NewStdlibRegistryRemote( - "https://codepathfinder.dev/assets/registries", - pythonVersion, - ) - - // Load manifest from CDN - err := remoteLoader.LoadManifest() - if err != nil { - log.Printf("Warning: Failed to load stdlib registry from CDN: %v", err) - // Continue without stdlib resolution - not a fatal error - } else { - // Create adapter to satisfy existing StdlibRegistry interface - stdlibRegistry := &core.StdlibRegistry{ - Modules: make(map[string]*core.StdlibModule), - Manifest: remoteLoader.Manifest, - } - - // The remote loader will lazy-load modules as needed - // We store a reference to it for on-demand loading - typeEngine.StdlibRegistry = stdlibRegistry - typeEngine.StdlibRemote = remoteLoader - - log.Printf("Loaded stdlib manifest from CDN: %d modules available", remoteLoader.ModuleCount()) - } - - // First, index all function definitions from the code graph - // This builds the Functions map for quick lookup - indexFunctions(codeGraph, callGraph, registry) - - // Phase 2 Task 9: Extract return types from all functions (first pass) - allReturnStatements := make([]*resolution.ReturnStatement, 
0) - for modulePath, filePath := range registry.Modules { - if !strings.HasSuffix(filePath, ".py") { - continue - } - - sourceCode, err := readFileBytes(filePath) - if err != nil { - continue - } - - // Extract return types - returns, err := resolution.ExtractReturnTypes(filePath, sourceCode, modulePath, typeEngine.Builtins) - if err != nil { - continue - } - - allReturnStatements = append(allReturnStatements, returns...) - } - - // Merge return types and add to engine - mergedReturns := resolution.MergeReturnTypes(allReturnStatements) - typeEngine.AddReturnTypesToEngine(mergedReturns) - - // Phase 2 Task 8: Extract ALL variable assignments BEFORE resolving calls (second pass) - for _, filePath := range registry.Modules { - if !strings.HasSuffix(filePath, ".py") { - continue - } - - sourceCode, err := readFileBytes(filePath) - if err != nil { - continue - } - - // Extract variable assignments for type inference - _ = extraction.ExtractVariableAssignments(filePath, sourceCode, typeEngine, registry, typeEngine.Builtins) - } - - // Phase 2 Task 8: Resolve call: placeholders with return types - // This MUST happen before we start resolving call sites! 
- typeEngine.UpdateVariableBindingsWithFunctionReturns() - - // Phase 3 Task 12: Extract class attributes (third pass) - for modulePath, filePath := range registry.Modules { - if !strings.HasSuffix(filePath, ".py") { - continue - } - - sourceCode, err := readFileBytes(filePath) - if err != nil { - continue - } - - // Extract class attributes for self.attr tracking - _ = extraction.ExtractClassAttributes(filePath, sourceCode, modulePath, typeEngine, typeEngine.Attributes) - } - - // Phase 3 Task 12: Resolve placeholder types in attributes (Pass 3) - resolution.ResolveAttributePlaceholders(typeEngine.Attributes, typeEngine, registry, codeGraph) - - // Process each Python file in the project (fourth pass for call site resolution) - for modulePath, filePath := range registry.Modules { - // Skip non-Python files - if !strings.HasSuffix(filePath, ".py") { - continue - } - - // Read source code for parsing - sourceCode, err := readFileBytes(filePath) - if err != nil { - // Skip files we can't read - continue - } - - // Extract imports using cache (avoids re-parsing if already cached) - importMap, err := importCache.GetOrExtract(filePath, sourceCode, registry) - if err != nil { - // Skip files with import errors - continue - } - - // Extract all call sites from this file - callSites, err := resolution.ExtractCallSites(filePath, sourceCode, importMap) - if err != nil { - // Skip files with call site extraction errors - continue - } - - // Get all function definitions in this file - fileFunctions := getFunctionsInFile(codeGraph, filePath) - - // Process each call site to resolve targets and build edges - for _, callSite := range callSites { - // Find the caller function containing this call site - callerFQN := findContainingFunction(callSite.Location, fileFunctions, modulePath) - if callerFQN == "" { - // Call at module level - use module name as caller - callerFQN = modulePath - } - - // Resolve the call target to a fully qualified name - targetFQN, resolved, typeInfo := 
resolveCallTarget(callSite.Target, importMap, registry, modulePath, codeGraph, typeEngine, callerFQN, callGraph) - - // Update call site with resolution information - callSite.TargetFQN = targetFQN - callSite.Resolved = resolved - - // Phase 2 Task 10: Populate type inference metadata - if typeInfo != nil { - callSite.ResolvedViaTypeInference = true - callSite.InferredType = typeInfo.TypeFQN - callSite.TypeConfidence = typeInfo.Confidence - callSite.TypeSource = typeInfo.Source - } - - // If resolution failed, categorize the failure reason - if !resolved { - callSite.FailureReason = categorizeResolutionFailure(callSite.Target, targetFQN) - } - - // Add call site to graph (dereference pointer) - callGraph.AddCallSite(callerFQN, *callSite) - - // Add edge if we successfully resolved the target - if resolved { - callGraph.AddEdge(callerFQN, targetFQN) - } - } - } - - // Phase 3 Task 12: Print attribute failure analysis - resolution.PrintAttributeFailureStats() - - // Pass 5: Generate taint summaries for all functions - log.Printf("Pass 5: Generating taint summaries...") - generateTaintSummaries(callGraph, codeGraph, registry) - log.Printf("Generated taint summaries for %d functions", len(callGraph.Summaries)) - - return callGraph, nil + return cgbuilder.BuildCallGraph(codeGraph, registry, projectRoot) } -// indexFunctions builds the Functions map in the call graph. -// Extracts all function definitions from the code graph and maps them by FQN. +// resolveCallTarget is a wrapper for backward compatibility with tests. 
// -// Parameters: -// - codeGraph: the parsed code graph -// - callGraph: the call graph being built -// - registry: module registry for resolving file paths to modules -func indexFunctions(codeGraph *graph.CodeGraph, callGraph *core.CallGraph, registry *core.ModuleRegistry) { - for _, node := range codeGraph.Nodes { - // Only index function/method definitions - if node.Type != "method_declaration" && node.Type != "function_definition" { - continue - } - - // Get the module path for this function's file - modulePath, ok := registry.FileToModule[node.File] - if !ok { - continue - } - - // Build fully qualified name: module.function - fqn := modulePath + "." + node.Name - callGraph.Functions[fqn] = node - } +// +// Deprecated: Use builder.ResolveCallTarget instead. +func resolveCallTarget(target string, importMap *core.ImportMap, registry *core.ModuleRegistry, currentModule string, codeGraph *graph.CodeGraph, typeEngine *resolution.TypeInferenceEngine, callerFQN string, callGraph *core.CallGraph) (string, bool, *core.TypeInfo) { + return cgbuilder.ResolveCallTarget(target, importMap, registry, currentModule, codeGraph, typeEngine, callerFQN, callGraph) } -// getFunctionsInFile returns all function definitions in a specific file. -// -// Parameters: -// - codeGraph: the parsed code graph -// - filePath: absolute path to the file -// -// Returns: -// - List of function/method nodes in the file, sorted by line number -func getFunctionsInFile(codeGraph *graph.CodeGraph, filePath string) []*graph.Node { - var functions []*graph.Node - - for _, node := range codeGraph.Nodes { - if node.File == filePath && - (node.Type == "method_declaration" || node.Type == "function_definition") { - functions = append(functions, node) - } - } - - return functions -} - -// findContainingFunction finds the function that contains a given call site location. -// Uses line numbers to determine which function a call belongs to. -// -// Algorithm: -// 1. Iterate through all functions in the file -// 2. 
Find function with the highest line number that's still <= call line -// 3. Return the FQN of that function -// -// Parameters: -// - location: source location of the call site -// - functions: all function definitions in the file -// - modulePath: module path of the file -// -// Returns: -// - Fully qualified name of the containing function, or empty if not found -func findContainingFunction(location core.Location, functions []*graph.Node, modulePath string) string { - // In Python, module-level code has no indentation (column == 1) - // If the call site is at column 1, it's module-level, not inside any function - if location.Column == 1 { - return "" - } - - var bestMatch *graph.Node - var bestLine uint32 - - for _, fn := range functions { - // Check if call site is after this function definition - if uint32(location.Line) >= fn.LineNumber { - // Keep track of the closest preceding function - if bestMatch == nil || fn.LineNumber > bestLine { - bestMatch = fn - bestLine = fn.LineNumber - } - } - } - - if bestMatch != nil { - return modulePath + "." + bestMatch.Name - } - - return "" -} - -// resolveCallTarget resolves a call target name to a fully qualified name. -// This is the core resolution logic that handles: -// - Direct function calls: sanitize() → myapp.utils.sanitize -// - Method calls: obj.method() → (unresolved, needs type inference) -// - Imported functions: from utils import sanitize; sanitize() → myapp.utils.sanitize -// - Qualified calls: utils.sanitize() → myapp.utils.sanitize -// -// Algorithm: -// 1. Check if target is a simple name (no dots) -// a. Look up in import map -// b. If found, return FQN from import -// c. If not found, try to find in same module -// 2. If target has dots (qualified name) -// a. Split into base and rest -// b. Resolve base using import map -// c. Append rest to get full FQN -// 3. 
If all else fails, check if it exists in the registry -// -// Parameters: -// - target: the call target name (e.g., "sanitize", "utils.sanitize", "obj.method") -// - importMap: import mappings for the current file -// - registry: module registry for validation -// - currentModule: the module containing this call -// -// Returns: -// - Fully qualified name of the target -// - Boolean indicating if resolution was successful -// -// Examples: -// target="sanitize", imports={"sanitize": "myapp.utils.sanitize"} -// → "myapp.utils.sanitize", true +// findFunctionAtLine is a wrapper for backward compatibility with tests. // -// target="utils.sanitize", imports={"utils": "myapp.utils"} -// → "myapp.utils.sanitize", true -// -// target="obj.method", imports={} -// → "obj.method", false (needs type inference) - -// Python built-in functions that should not be resolved as module functions. -var pythonBuiltins = map[string]bool{ - "eval": true, - "exec": true, - "input": true, - "raw_input": true, - "compile": true, - "__import__": true, +// Deprecated: Use builder.FindFunctionAtLine instead. +func findFunctionAtLine(root *sitter.Node, lineNumber uint32) *sitter.Node { + return cgbuilder.FindFunctionAtLine(root, lineNumber) } -// categorizeResolutionFailure determines why a call target failed to resolve. -// This enables diagnostic reporting to understand resolution gaps. +// generateTaintSummaries is a wrapper for backward compatibility with tests. 
// -// Categories: -// - "external_framework" - Known external frameworks (Django, REST, pytest, stdlib) -// - "orm_pattern" - Django ORM patterns (Model.objects.*, queryset.*) -// - "attribute_chain" - Method calls on objects/return values -// - "variable_method" - Method calls that appear to be on variables -// - "super_call" - Calls via super() mechanism -// - "not_in_imports" - Simple name not found in imports -// - "unknown" - Other unresolved patterns -// -// Parameters: -// - target: original call target string (e.g., "models.ForeignKey") -// - targetFQN: resolved fully qualified name (e.g., "django.db.models.ForeignKey") -// -// Returns: -// - category string describing the failure reason -func categorizeResolutionFailure(target, targetFQN string) string { - // Check for external frameworks (common patterns) - if strings.HasPrefix(targetFQN, "django.") || - strings.HasPrefix(targetFQN, "rest_framework.") || - strings.HasPrefix(targetFQN, "pytest.") || - strings.HasPrefix(targetFQN, "unittest.") || - strings.HasPrefix(targetFQN, "json.") || - strings.HasPrefix(targetFQN, "logging.") || - strings.HasPrefix(targetFQN, "os.") || - strings.HasPrefix(targetFQN, "sys.") || - strings.HasPrefix(targetFQN, "re.") || - strings.HasPrefix(targetFQN, "pathlib.") || - strings.HasPrefix(targetFQN, "collections.") || - strings.HasPrefix(targetFQN, "datetime.") { - return "external_framework" - } - - // Check for Django ORM patterns - if strings.Contains(target, ".objects.") || - strings.HasSuffix(target, ".objects") || - (strings.Contains(target, ".") && (strings.HasSuffix(target, ".filter") || - strings.HasSuffix(target, ".get") || - strings.HasSuffix(target, ".create") || - strings.HasSuffix(target, ".update") || - strings.HasSuffix(target, ".delete") || - strings.HasSuffix(target, ".all") || - strings.HasSuffix(target, ".first") || - strings.HasSuffix(target, ".last") || - strings.HasSuffix(target, ".count") || - strings.HasSuffix(target, ".exists"))) { - return 
"orm_pattern" - } - - // Check for super() calls - if strings.HasPrefix(target, "super(") || strings.HasPrefix(target, "super.") { - return "super_call" - } - - // Check for attribute chains (has dots, looks like obj.method()) - // Heuristic: lowercase first component likely means variable/object - if dotIndex := strings.Index(target, "."); dotIndex != -1 { - firstComponent := target[:dotIndex] - // If starts with lowercase and not a known module pattern, likely attribute chain - if len(firstComponent) > 0 && firstComponent[0] >= 'a' && firstComponent[0] <= 'z' { - // Could be variable method or attribute chain - // Check common variable-like patterns - if firstComponent == "self" || firstComponent == "cls" || - firstComponent == "request" || firstComponent == "response" || - firstComponent == "queryset" || firstComponent == "user" || - firstComponent == "obj" || firstComponent == "value" || - firstComponent == "data" || firstComponent == "result" { - return "variable_method" - } - return "attribute_chain" - } - } - - // Simple name (no dots) - not in imports - if !strings.Contains(target, ".") { - return "not_in_imports" - } - - // Everything else - return "unknown" -} - -func resolveCallTarget(target string, importMap *core.ImportMap, registry *core.ModuleRegistry, currentModule string, codeGraph *graph.CodeGraph, typeEngine *resolution.TypeInferenceEngine, callerFQN string, callGraph *core.CallGraph) (string, bool, *core.TypeInfo) { - // Backward compatibility: if typeEngine or callerFQN not provided, skip type inference - if typeEngine == nil || callerFQN == "" { - fqn, resolved := resolveCallTargetLegacy(target, importMap, registry, currentModule, codeGraph) - return fqn, resolved, nil - } - - // Phase 3 Task 11: Check for method chaining BEFORE other resolution - // Chains have pattern "()." 
indicating call followed by attribute access - if strings.Contains(target, ").") { - chainFQN, chainResolved, chainType := ResolveChainedCall( - target, - typeEngine, - typeEngine.Builtins, - registry, - codeGraph, - callerFQN, - currentModule, - callGraph, - ) - if chainResolved { - return chainFQN, true, chainType - } - // Chain parsing attempted but failed - fall through to regular resolution - } - - // Phase 3 Task 12: Check for self.attribute.method() patterns BEFORE self.method() - // Pattern: self.attr.method (2+ dots starting with self.) - if strings.HasPrefix(target, "self.") && strings.Count(target, ".") >= 2 { - attrFQN, attrResolved, attrType := resolution.ResolveSelfAttributeCall( - target, - callerFQN, - typeEngine, - typeEngine.Builtins, - callGraph, - ) - if attrResolved { - return attrFQN, true, attrType - } - // Attribute resolution attempted but failed - fall through - } - - // Handle self.method() calls - resolve to current module - if strings.HasPrefix(target, "self.") { - methodName := strings.TrimPrefix(target, "self.") - // Resolve to module.method - moduleFQN := currentModule + "." + methodName - // Validate exists - if validateFQN(moduleFQN, registry) { - return moduleFQN, true, nil - } - // Return unresolved but with module prefix - return moduleFQN, false, nil - } - - // Handle simple names (no dots) - if !strings.Contains(target, ".") { - // Check if it's a Python built-in - if pythonBuiltins[target] { - // Return as builtins.function for pattern matching - return "builtins." + target, true, nil - } - - // Try to resolve through imports - if fqn, ok := importMap.Resolve(target); ok { - // Found in imports - return the FQN - // Check if it's a known framework - if isKnown, _ := IsKnownFramework(fqn); isKnown { - return fqn, true, nil - } - // Validate if it exists in registry - resolved := validateFQN(fqn, registry) - return fqn, resolved, nil - } - - // Not in imports - might be in same module - sameLevelFQN := currentModule + "." 
+ target - if validateFQN(sameLevelFQN, registry) { - return sameLevelFQN, true, nil - } - - // Can't resolve - return as-is - return target, false, nil - } - - // Handle qualified names (with dots) - parts := strings.SplitN(target, ".", 2) - base := parts[0] - rest := parts[1] - - // Phase 2 Task 9: Try type inference for variable.method() calls - if typeEngine != nil && callerFQN != "" { - // Try function scope first, then fall back to module scope - var binding *resolution.VariableBinding - - // Check function scope first - functionScope := typeEngine.GetScope(callerFQN) - if functionScope != nil { - if b, exists := functionScope.Variables[base]; exists { - binding = b - } - } - - // If not found in function scope, try module scope - if binding == nil { - moduleScope := typeEngine.GetScope(currentModule) - if moduleScope != nil { - if b, exists := moduleScope.Variables[base]; exists { - binding = b - } - } - } - - if binding != nil { - // Check if variable has type information - if binding.Type != nil { - typeFQN := binding.Type.TypeFQN - - // Skip placeholders (call:, var:) - not yet resolved - if strings.HasPrefix(typeFQN, "call:") || strings.HasPrefix(typeFQN, "var:") { - // Continue to legacy resolution - } else { - // Check if it's a builtin type - if typeEngine.Builtins != nil && strings.HasPrefix(typeFQN, "builtins.") { - method := typeEngine.Builtins.GetMethod(typeFQN, rest) - if method != nil { - // Resolved to builtin method - return with type info - return typeFQN + "." + rest, true, binding.Type - } - } - - // Check if it's a project type (user-defined class/method) - methodFQN := typeFQN + "." 
+ rest - - // Validate method exists in code graph - if codeGraph != nil { - if node, ok := codeGraph.Nodes[methodFQN]; ok { - if node.Type == "method_declaration" || node.Type == "function_definition" { - // Resolved via code graph validation - return with type info - return methodFQN, true, binding.Type - } - } - - // Python class methods are stored at module level (e.g., test.save, not test.User.save) - // Try stripping the class name and looking for module.method - lastDot := strings.LastIndex(typeFQN, ".") - if lastDot >= 0 { - modulePart := typeFQN[:lastDot] - className := typeFQN[lastDot+1:] - - // Check if it looks like a Python class (PascalCase) - if len(className) > 0 && className[0] >= 'A' && className[0] <= 'Z' { - pythonMethodFQN := modulePart + "." + rest - if callGraph != nil { - if node, ok := callGraph.Functions[pythonMethodFQN]; ok { - if node.Type == "method_declaration" || node.Type == "function_definition" { - // Resolved via Python module-level method lookup - return pythonMethodFQN, true, binding.Type - } - } - } - } - } - } - - // Heuristic: If type has good confidence (>= 0.7), assume method exists - if binding.Type.Confidence >= 0.7 { - // Resolved via confidence heuristic - return with type info - return methodFQN, true, binding.Type - } - - } - } - } - } - - // Try to resolve base through imports - if baseFQN, ok := importMap.Resolve(base); ok { - fullFQN := baseFQN + "." 
+ rest - // Check if it's a known framework - if isKnown, _ := IsKnownFramework(fullFQN); isKnown { - return fullFQN, true, nil - } - // Check if it's an ORM pattern (before validateFQN, since ORM methods don't exist in source) - if ormFQN, resolved := ResolveORMCall(target, currentModule, registry, codeGraph); resolved { - return ormFQN, true, nil - } - // PR #3: Check stdlib registry before user project registry - if typeEngine != nil && typeEngine.StdlibRemote != nil { - if remoteLoader, ok := typeEngine.StdlibRemote.(*cgregistry.StdlibRegistryRemote); ok { - if validateStdlibFQN(fullFQN, remoteLoader) { - return fullFQN, true, nil - } - } - } - if validateFQN(fullFQN, registry) { - return fullFQN, true, nil - } - return fullFQN, false, nil - } - - // Base not in imports - might be module-level access - // Try current module - fullFQN := currentModule + "." + target - if validateFQN(fullFQN, registry) { - return fullFQN, true, nil - } - - // Before giving up, check if it's an ORM pattern (Django, SQLAlchemy, etc.) - // ORM methods are dynamically generated at runtime and won't be in source - if ormFQN, resolved := ResolveORMCall(target, currentModule, registry, codeGraph); resolved { - return ormFQN, true, nil - } - - // PR #3: Last resort - check if target is a stdlib call (e.g., os.path.join) - // This handles cases where stdlib modules are imported directly (import os.path) - if typeEngine != nil && typeEngine.StdlibRemote != nil { - if remoteLoader, ok := typeEngine.StdlibRemote.(*cgregistry.StdlibRegistryRemote); ok { - if validateStdlibFQN(target, remoteLoader) { - return target, true, nil - } - } - } - - // Can't resolve - return as-is - return target, false, nil +// Deprecated: Use builder.GenerateTaintSummaries instead. 
+func generateTaintSummaries(callGraph *core.CallGraph, codeGraph *graph.CodeGraph, registry *core.ModuleRegistry) { + cgbuilder.GenerateTaintSummaries(callGraph, codeGraph, registry) } -// stdlibModuleAliases maps platform-specific module aliases to their canonical names. -// For example, os.path is posixpath on Unix/Linux/Mac and ntpath on Windows. -var stdlibModuleAliases = map[string]string{ - "os.path": "posixpath", // On POSIX systems (Unix, Linux, macOS) - // Note: On Windows, os.path would be ntpath, but we default to POSIX - // since most development happens on Unix-like systems -} +// Note: detectPythonVersion is defined in python_version_detector.go and delegates to builder package. -// validateStdlibFQN checks if a fully qualified name is a stdlib function. -// Supports module.function, module.submodule.function, and module.Class patterns. -// Handles platform-specific module aliases (e.g., os.path -> posixpath). -// Uses lazy loading via remote registry to download modules on-demand. +// validateStdlibFQN is a wrapper for backward compatibility with tests. // -// Examples: -// "os.getcwd" - returns true if os.getcwd exists in stdlib -// "os.path.join" - returns true if posixpath.join exists in stdlib (alias resolution) -// "json.dumps" - returns true if json.dumps exists in stdlib -// -// Parameters: -// - fqn: fully qualified name to check -// - remoteLoader: remote stdlib registry loader -// -// Returns: -// - true if FQN is a stdlib function or class +// Deprecated: Use builder.ValidateStdlibFQN instead. func validateStdlibFQN(fqn string, remoteLoader *cgregistry.StdlibRegistryRemote) bool { - if remoteLoader == nil { - return false - } - - // Split FQN into parts: os.path.join -> ["os", "path", "join"] - parts := strings.Split(fqn, ".") - if len(parts) < 2 { - return false - } - - // Try different module combinations - // For "os.path.join", try: - // 1. module="os.path", function="join" (with alias resolution) - // 2. 
module="os", function="path.join" - // 3. module="os", function="path" (submodule) - - // Try longest match first (os.path) - for i := len(parts) - 1; i >= 1; i-- { - moduleName := strings.Join(parts[:i], ".") - functionName := parts[i] - - // Check if this module is an alias (e.g., os.path -> posixpath) - if canonicalName, isAlias := stdlibModuleAliases[moduleName]; isAlias { - moduleName = canonicalName - } - - // Lazy load module from remote registry - module, err := remoteLoader.GetModule(moduleName) - if err != nil { - log.Printf("Warning: Failed to load stdlib module %s: %v", moduleName, err) - continue - } - if module == nil { - continue - } - - // Check if it's a function - if _, ok := module.Functions[functionName]; ok { - return true - } - - // Check if it's a class - if _, ok := module.Classes[functionName]; ok { - return true - } - - // Check if it's a constant - if _, ok := module.Constants[functionName]; ok { - return true - } - - // Check if it's an attribute - if _, ok := module.Attributes[functionName]; ok { - return true - } - } - - return false + return cgbuilder.ValidateStdlibFQN(fqn, remoteLoader) } -// validateFQN checks if a fully qualified name exists in the registry. -// Handles both module names and function names within modules. -// -// Examples: -// "myapp.utils" - checks if module exists -// "myapp.utils.sanitize" - checks if module "myapp.utils" exists -// -// Parameters: -// - fqn: fully qualified name to validate -// - registry: module registry +// validateFQN is a wrapper for backward compatibility with tests. // -// Returns: -// - true if FQN is valid (module or function in existing module) +// Deprecated: Use builder.ValidateFQN instead. 
func validateFQN(fqn string, registry *core.ModuleRegistry) bool { - // Check if it's a module - if _, ok := registry.Modules[fqn]; ok { - return true - } - - // Check if parent module exists (for functions) - // "myapp.utils.sanitize" → check if "myapp.utils" exists - lastDot := strings.LastIndex(fqn, ".") - if lastDot > 0 { - parentModule := fqn[:lastDot] - if _, ok := registry.Modules[parentModule]; ok { - return true - } - } - - return false -} - -// resolveCallTargetLegacy is the old resolution logic without type inference. -// Used for backward compatibility with existing tests. -func resolveCallTargetLegacy(target string, importMap *core.ImportMap, registry *core.ModuleRegistry, currentModule string, codeGraph *graph.CodeGraph) (string, bool) { - // Handle self.method() calls - resolve to current module - if strings.HasPrefix(target, "self.") { - methodName := strings.TrimPrefix(target, "self.") - // Resolve to module.method - moduleFQN := currentModule + "." + methodName - // Validate exists - if validateFQN(moduleFQN, registry) { - return moduleFQN, true - } - // Return unresolved but with module prefix - return moduleFQN, false - } - - // Handle simple names (no dots) - if !strings.Contains(target, ".") { - // Check if it's a Python built-in - if pythonBuiltins[target] { - // Return as builtins.function for pattern matching - return "builtins." + target, true - } - - // Try to resolve through imports - if fqn, ok := importMap.Resolve(target); ok { - // Found in imports - return the FQN - // Check if it's a known framework - if isKnown, _ := IsKnownFramework(fqn); isKnown { - return fqn, true - } - // Validate if it exists in registry - resolved := validateFQN(fqn, registry) - return fqn, resolved - } - - // Not in imports - might be in same module - sameLevelFQN := currentModule + "." 
+ target - if validateFQN(sameLevelFQN, registry) { - return sameLevelFQN, true - } - - // Can't resolve - return as-is - return target, false - } - - // Handle qualified names (with dots) - parts := strings.SplitN(target, ".", 2) - base := parts[0] - rest := parts[1] - - // Try to resolve base through imports - if baseFQN, ok := importMap.Resolve(base); ok { - fullFQN := baseFQN + "." + rest - // Check if it's a known framework - if isKnown, _ := IsKnownFramework(fullFQN); isKnown { - return fullFQN, true - } - // Check if it's an ORM pattern (before validateFQN, since ORM methods don't exist in source) - if ormFQN, resolved := ResolveORMCall(target, currentModule, registry, codeGraph); resolved { - return ormFQN, true - } - if validateFQN(fullFQN, registry) { - return fullFQN, true - } - return fullFQN, false - } - - // Base not in imports - might be module-level access - // Try current module - fullFQN := currentModule + "." + target - if validateFQN(fullFQN, registry) { - return fullFQN, true - } - - // Before giving up, check if it's an ORM pattern (Django, SQLAlchemy, etc.) - // ORM methods are dynamically generated at runtime and won't be in source - if ormFQN, resolved := ResolveORMCall(target, currentModule, registry, codeGraph); resolved { - return ormFQN, true - } - - // Can't resolve - return as-is - return target, false + return cgbuilder.ValidateFQN(fqn, registry) } -// readFileBytes reads a file and returns its contents as a byte slice. -// Helper function for reading source code. -func readFileBytes(filePath string) ([]byte, error) { - absPath, err := filepath.Abs(filePath) - if err != nil { - return nil, err - } - return os.ReadFile(absPath) +// indexFunctions is a wrapper for backward compatibility with tests. +// +// Deprecated: Use builder.IndexFunctions instead. 
+func indexFunctions(codeGraph *graph.CodeGraph, callGraph *core.CallGraph, registry *core.ModuleRegistry) { + cgbuilder.IndexFunctions(codeGraph, callGraph, registry) } -// generateTaintSummaries analyzes all Python functions for taint flows. -// This is Pass 5 of the call graph building process. -// -// For each function: -// 1. Extract statements from AST -// 2. Build def-use chains -// 3. Analyze intra-procedural taint -// 4. Store TaintSummary in callGraph.Summaries +// getFunctionsInFile is a wrapper for backward compatibility with tests. // -// Parameters: -// - callGraph: the call graph being built (will be populated with summaries) -// - codeGraph: the parsed AST nodes (currently unused, reserved for future use) -// - registry: module registry (currently unused, reserved for future use) -func generateTaintSummaries(callGraph *core.CallGraph, codeGraph *graph.CodeGraph, registry *core.ModuleRegistry) { - _ = codeGraph // Reserved for future use - _ = registry // Reserved for future use - analyzed := 0 - total := len(callGraph.Functions) - - // Iterate over all indexed functions - for funcFQN, funcNode := range callGraph.Functions { - // Read source code for this function's file - sourceCode, err := readFileBytes(funcNode.File) - if err != nil { - log.Printf("Warning: failed to read file %s for taint analysis: %v", funcNode.File, err) - continue - } - - // Parse the Python file to get AST - tree, err := extraction.ParsePythonFile(sourceCode) - if err != nil { - log.Printf("Warning: failed to parse %s for taint analysis: %v", funcNode.File, err) - continue - } - - // Find the function node in the AST by line number - functionNode := findFunctionAtLine(tree.RootNode(), funcNode.LineNumber) - if functionNode == nil { - log.Printf("Warning: could not find function %s at line %d", funcFQN, funcNode.LineNumber) - if tree != nil { - tree.Close() - } - continue - } - - // Step 1: Extract statements from function - statements, err := 
extraction.ExtractStatements(funcNode.File, sourceCode, functionNode) - if err != nil { - log.Printf("Warning: failed to extract statements from %s: %v", funcFQN, err) - if tree != nil { - tree.Close() - } - continue - } - - // Step 2: Build def-use chains - defUseChain := core.BuildDefUseChains(statements) - - // Step 3: Analyze intra-procedural taint - // For MVP: use empty sources/sinks/sanitizers (will be populated from patterns in PR #6) - summary := taint.AnalyzeIntraProceduralTaint( - funcFQN, - statements, - defUseChain, - []string{}, // sources - will come from patterns - []string{}, // sinks - will come from patterns - []string{}, // sanitizers - will come from patterns - ) - - // Step 4: Store summary - callGraph.Summaries[funcFQN] = summary - - analyzed++ - - // Report progress every 1000 functions - if analyzed%1000 == 0 { - log.Printf("Analyzed %d/%d functions...", analyzed, total) - } - - // Clean up tree-sitter tree - if tree != nil { - tree.Close() - } - } +// Deprecated: Use builder.GetFunctionsInFile instead. +func getFunctionsInFile(codeGraph *graph.CodeGraph, filePath string) []*graph.Node { + return cgbuilder.GetFunctionsInFile(codeGraph, filePath) } -// findFunctionAtLine searches for a function definition at the specified line number. -// Returns the tree-sitter node for the function, or nil if not found. -func findFunctionAtLine(root *sitter.Node, lineNumber uint32) *sitter.Node { - if root == nil { - return nil - } - - // Check if this node is a function definition at the target line - if (root.Type() == "function_definition" || root.Type() == "method_declaration") && - root.StartPoint().Row+1 == lineNumber { - return root - } - - // Recursively search children - for i := 0; i < int(root.ChildCount()); i++ { - if result := findFunctionAtLine(root.Child(i), lineNumber); result != nil { - return result - } - } - - return nil +// findContainingFunction is a wrapper for backward compatibility with tests. 
+// +// Deprecated: Use builder.FindContainingFunction instead. +func findContainingFunction(location core.Location, functions []*graph.Node, modulePath string) string { + return cgbuilder.FindContainingFunction(location, functions, modulePath) } diff --git a/sourcecode-parser/graph/callgraph/builder/builder.go b/sourcecode-parser/graph/callgraph/builder/builder.go new file mode 100644 index 00000000..3800b0e7 --- /dev/null +++ b/sourcecode-parser/graph/callgraph/builder/builder.go @@ -0,0 +1,1011 @@ +package builder + +import ( + "log" + "path/filepath" + "strings" + + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/core" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/extraction" + cgregistry "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/registry" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/resolution" +) + +// BuildCallGraph constructs the complete call graph for a Python project. +// This is Pass 3 of the 3-pass algorithm: +// - Pass 1: BuildModuleRegistry - map files to modules +// - Pass 2: ExtractImports + ExtractCallSites - parse imports and calls +// - Pass 3: BuildCallGraph - resolve calls and build graph +// +// Algorithm: +// 1. For each Python file in the project: +// a. Extract imports to build ImportMap +// b. Extract call sites from AST +// c. Extract function definitions from main graph +// 2. For each call site: +// a. Resolve target name using ImportMap +// b. Find target function definition in registry +// c. Add edge from caller to callee +// d. 
//     Store detailed call site information
//
// Parameters:
//   - codeGraph: the existing code graph with parsed AST nodes
//   - registry: module registry mapping files to modules
//   - projectRoot: absolute path to project root
//
// Returns:
//   - CallGraph: call graph with edges, call sites and taint summaries
//   - error: the only return statement yields nil; files that cannot be read
//     or parsed are skipped rather than reported
//
// Example:
//
//	Given:
//	  File: myapp/views.py
//	    def get_user():
//	        sanitize(data)  # call to myapp.utils.sanitize
//
//	Creates:
//	  edges: {"myapp.views.get_user": ["myapp.utils.sanitize"]}
//	  reverseEdges: {"myapp.utils.sanitize": ["myapp.views.get_user"]}
//	  callSites: {"myapp.views.get_user": [CallSite{Target: "sanitize", ...}]}
func BuildCallGraph(codeGraph *graph.CodeGraph, registry *core.ModuleRegistry, projectRoot string) (*core.CallGraph, error) {
	callGraph := core.NewCallGraph()

	// One import-map cache for the whole build, so imports of a file are
	// extracted at most once.
	importCache := NewImportMapCache()

	// Type inference engine with its builtin registry.
	typeEngine := resolution.NewTypeInferenceEngine(registry)
	typeEngine.Builtins = cgregistry.NewBuiltinRegistry()

	// Attribute registry used for self.attr tracking (filled in the third pass).
	typeEngine.Attributes = cgregistry.NewAttributeRegistry()

	// Detect the project's Python version; it selects which stdlib registry
	// is requested from the CDN below.
	pythonVersion := DetectPythonVersion(projectRoot)
	log.Printf("Detected Python version: %s", pythonVersion)

	// Remote loader for the stdlib registry of the detected version.
	remoteLoader := cgregistry.NewStdlibRegistryRemote(
		"https://codepathfinder.dev/assets/registries",
		pythonVersion,
	)

	// Load the manifest. A failure is logged and the build continues without
	// stdlib resolution (typeEngine.StdlibRemote stays unset).
	err := remoteLoader.LoadManifest()
	if err != nil {
		log.Printf("Warning: Failed to load stdlib registry from CDN: %v", err)
		// Continue without stdlib resolution - not a fatal error
	} else {
		// Registry value starts with an empty module map and the loaded manifest.
		stdlibRegistry := &core.StdlibRegistry{
			Modules:  make(map[string]*core.StdlibModule),
			Manifest: remoteLoader.Manifest,
		}

		// Keep both the registry and the loader on the engine; the loader is
		// what resolveCallTarget later hands to validateStdlibFQN.
		typeEngine.StdlibRegistry = stdlibRegistry
		typeEngine.StdlibRemote = remoteLoader

		log.Printf("Loaded stdlib manifest from CDN: %d modules available", remoteLoader.ModuleCount())
	}

	// Index all function definitions from the code graph into
	// callGraph.Functions before any resolution happens.
	indexFunctions(codeGraph, callGraph, registry)

	// First pass: collect return statements from every .py file.
	// Files that cannot be read or analysed are skipped silently.
	allReturnStatements := make([]*resolution.ReturnStatement, 0)
	for modulePath, filePath := range registry.Modules {
		if !strings.HasSuffix(filePath, ".py") {
			continue
		}

		sourceCode, err := ReadFileBytes(filePath)
		if err != nil {
			continue
		}

		// Extract return types
		returns, err := resolution.ExtractReturnTypes(filePath, sourceCode, modulePath, typeEngine.Builtins)
		if err != nil {
			continue
		}

		allReturnStatements = append(allReturnStatements, returns...)
	}

	// Merge return types and add to engine
	mergedReturns := resolution.MergeReturnTypes(allReturnStatements)
	typeEngine.AddReturnTypesToEngine(mergedReturns)

	// Second pass: extract ALL variable assignments BEFORE resolving calls.
	for _, filePath := range registry.Modules {
		if !strings.HasSuffix(filePath, ".py") {
			continue
		}

		sourceCode, err := ReadFileBytes(filePath)
		if err != nil {
			continue
		}

		// Result is deliberately discarded: extraction is best-effort per file.
		_ = extraction.ExtractVariableAssignments(filePath, sourceCode, typeEngine, registry, typeEngine.Builtins)
	}

	// Resolve "call:" placeholders with the return types merged above.
	// This MUST happen before we start resolving call sites!
	typeEngine.UpdateVariableBindingsWithFunctionReturns()

	// Third pass: extract class attributes.
	for modulePath, filePath := range registry.Modules {
		if !strings.HasSuffix(filePath, ".py") {
			continue
		}

		sourceCode, err := ReadFileBytes(filePath)
		if err != nil {
			continue
		}

		// Result is deliberately discarded: extraction is best-effort per file.
		_ = extraction.ExtractClassAttributes(filePath, sourceCode, modulePath, typeEngine, typeEngine.Attributes)
	}

	// Resolve placeholder types in the attributes gathered by the third pass.
	resolution.ResolveAttributePlaceholders(typeEngine.Attributes, typeEngine, registry, codeGraph)

	// Fourth pass: resolve call sites file by file and build the edges.
	for modulePath, filePath := range registry.Modules {
		// Skip non-Python files
		if !strings.HasSuffix(filePath, ".py") {
			continue
		}

		// Read source code for parsing
		sourceCode, err := ReadFileBytes(filePath)
		if err != nil {
			// Skip files we can't read
			continue
		}

		// Extract imports using cache (avoids re-parsing if already cached)
		importMap, err := importCache.GetOrExtract(filePath, sourceCode, registry)
		if err != nil {
			// Skip files with import errors
			continue
		}

		// Extract all call sites from this file
		callSites, err := resolution.ExtractCallSites(filePath, sourceCode, importMap)
		if err != nil {
			// Skip files with call site extraction errors
			continue
		}

		// Get all function definitions in this file
		fileFunctions := getFunctionsInFile(codeGraph, filePath)

		// Process each call site to resolve targets and build edges
		for _, callSite := range callSites {
			// Find the caller function containing this call site
			callerFQN := findContainingFunction(callSite.Location, fileFunctions, modulePath)
			if callerFQN == "" {
				// Call at module level - use module name as caller
				callerFQN = modulePath
			}

			// Resolve the call target to a fully qualified name
			targetFQN, resolved, typeInfo := resolveCallTarget(callSite.Target, importMap, registry, modulePath, codeGraph, typeEngine, callerFQN, callGraph)

			// Update call site with resolution information
			callSite.TargetFQN = targetFQN
			callSite.Resolved = resolved

			// typeInfo is non-nil only when resolveCallTarget resolved the
			// call through type inference; copy its metadata onto the site.
			if typeInfo != nil {
				callSite.ResolvedViaTypeInference = true
				callSite.InferredType = typeInfo.TypeFQN
				callSite.TypeConfidence = typeInfo.Confidence
				callSite.TypeSource = typeInfo.Source
			}

			// If resolution failed, categorize the failure reason
			if !resolved {
				callSite.FailureReason = categorizeResolutionFailure(callSite.Target, targetFQN)
			}

			// Every call site is recorded, resolved or not (dereference pointer)
			callGraph.AddCallSite(callerFQN, *callSite)

			// Edges are added only for resolved targets
			if resolved {
				callGraph.AddEdge(callerFQN, targetFQN)
			}
		}
	}

	// Print attribute failure analysis
	resolution.PrintAttributeFailureStats()

	// Final pass (logged as "Pass 5"): generate taint summaries for all functions
	log.Printf("Pass 5: Generating taint summaries...")
	GenerateTaintSummaries(callGraph, codeGraph, registry)
	log.Printf("Generated taint summaries for %d functions", len(callGraph.Summaries))

	return callGraph, nil
}

// IndexFunctions builds the Functions map in the call graph.
// Extracts all function definitions from the code graph and maps them by FQN.
//
// Parameters:
//   - codeGraph: the parsed code graph
//   - callGraph: the call graph being built
//   - registry: module registry for resolving file paths to modules
func IndexFunctions(codeGraph *graph.CodeGraph, callGraph *core.CallGraph, registry *core.ModuleRegistry) {
	indexFunctions(codeGraph, callGraph, registry)
}

// indexFunctions is the internal implementation of IndexFunctions.
+func indexFunctions(codeGraph *graph.CodeGraph, callGraph *core.CallGraph, registry *core.ModuleRegistry) { + for _, node := range codeGraph.Nodes { + // Only index function/method definitions + if node.Type != "method_declaration" && node.Type != "function_definition" { + continue + } + + // Get the module path for this function's file + modulePath, ok := registry.FileToModule[node.File] + if !ok { + continue + } + + // Build fully qualified name: module.function + fqn := modulePath + "." + node.Name + callGraph.Functions[fqn] = node + } +} + +// GetFunctionsInFile returns all function definitions in a specific file. +// +// Parameters: +// - codeGraph: the parsed code graph +// - filePath: absolute path to the file +// +// Returns: +// - List of function/method nodes in the file, sorted by line number +func GetFunctionsInFile(codeGraph *graph.CodeGraph, filePath string) []*graph.Node { + return getFunctionsInFile(codeGraph, filePath) +} + +// getFunctionsInFile is the internal implementation of GetFunctionsInFile. +func getFunctionsInFile(codeGraph *graph.CodeGraph, filePath string) []*graph.Node { + var functions []*graph.Node + + for _, node := range codeGraph.Nodes { + if node.File == filePath && + (node.Type == "method_declaration" || node.Type == "function_definition") { + functions = append(functions, node) + } + } + + return functions +} + +// FindContainingFunction finds the function that contains a given call site location. +// Uses line numbers to determine which function a call belongs to. +// +// Algorithm: +// 1. Iterate through all functions in the file +// 2. Find function with the highest line number that's still <= call line +// 3. 
Return the FQN of that function +// +// Parameters: +// - location: source location of the call site +// - functions: all function definitions in the file +// - modulePath: module path of the file +// +// Returns: +// - Fully qualified name of the containing function, or empty if not found +func FindContainingFunction(location core.Location, functions []*graph.Node, modulePath string) string { + return findContainingFunction(location, functions, modulePath) +} + +// findContainingFunction is the internal implementation of FindContainingFunction. +func findContainingFunction(location core.Location, functions []*graph.Node, modulePath string) string { + // In Python, module-level code has no indentation (column == 1) + // If the call site is at column 1, it's module-level, not inside any function + if location.Column == 1 { + return "" + } + + var bestMatch *graph.Node + var bestLine uint32 + + for _, fn := range functions { + // Check if call site is after this function definition + if uint32(location.Line) >= fn.LineNumber { + // Keep track of the closest preceding function + if bestMatch == nil || fn.LineNumber > bestLine { + bestMatch = fn + bestLine = fn.LineNumber + } + } + } + + if bestMatch != nil { + return modulePath + "." + bestMatch.Name + } + + return "" +} + +// categorizeResolutionFailure determines why a call target failed to resolve. +// This enables diagnostic reporting to understand resolution gaps. 
//
// Categories, checked in this order (first match wins):
//   - "external_framework" - targetFQN starts with a known external prefix
//     (Django, REST framework, pytest, unittest, selected stdlib modules)
//   - "super_call" - target starts with "super(" or "super."
//   - "orm_pattern" - Django ORM patterns (Model.objects.*, queryset.*)
//   - "not_in_imports" - Simple name (no dots) not found in imports
//   - "variable_method" - Dotted call whose first component is a common
//     variable name (self, cls, request, ...)
//   - "attribute_chain" - Dotted call whose first component starts with a
//     lowercase ASCII letter
//   - "unknown" - Other unresolved patterns
//
// The super() check runs before the ORM check on purpose: calls such as
// super().delete() or super().get() end in an ORM-looking suffix and would
// otherwise never be reported as "super_call".
//
// Parameters:
//   - target: original call target string (e.g., "models.ForeignKey")
//   - targetFQN: resolved fully qualified name (e.g., "django.db.models.ForeignKey")
//
// Returns:
//   - category string describing the failure reason
func categorizeResolutionFailure(target, targetFQN string) string {
	// Known external frameworks and stdlib modules, matched on the resolved FQN.
	externalPrefixes := [...]string{
		"django.", "rest_framework.", "pytest.", "unittest.",
		"json.", "logging.", "os.", "sys.", "re.",
		"pathlib.", "collections.", "datetime.",
	}
	for _, prefix := range externalPrefixes {
		if strings.HasPrefix(targetFQN, prefix) {
			return "external_framework"
		}
	}

	// super() calls must be recognised before the ORM suffixes below,
	// because super().delete / super().get / ... also match those suffixes.
	if strings.HasPrefix(target, "super(") || strings.HasPrefix(target, "super.") {
		return "super_call"
	}

	// Django ORM patterns: manager access or a queryset-style method suffix.
	if strings.Contains(target, ".objects.") || strings.HasSuffix(target, ".objects") {
		return "orm_pattern"
	}
	ormSuffixes := [...]string{
		".filter", ".get", ".create", ".update", ".delete",
		".all", ".first", ".last", ".count", ".exists",
	}
	for _, suffix := range ormSuffixes {
		if strings.HasSuffix(target, suffix) {
			return "orm_pattern"
		}
	}

	// Simple name (no dots) - not in imports.
	dot := strings.IndexByte(target, '.')
	if dot == -1 {
		return "not_in_imports"
	}

	// Heuristic: a lowercase first component likely names a variable/object.
	first := target[:dot]
	if first != "" && first[0] >= 'a' && first[0] <= 'z' {
		switch first {
		case "self", "cls", "request", "response", "queryset",
			"user", "obj", "value", "data", "result":
			return "variable_method"
		}
		return "attribute_chain"
	}

	// Everything else
	return "unknown"
}

// Python built-in functions that should not be resolved as module functions.
var pythonBuiltins = map[string]bool{
	"eval":       true,
	"exec":       true,
	"input":      true,
	"raw_input":  true,
	"compile":    true,
	"__import__": true,
}

// ResolveCallTarget resolves a call target name to a fully qualified name.
// This is the core resolution logic that handles:
//   - Direct function calls: sanitize() → myapp.utils.sanitize
//   - Method calls: obj.method() → (unresolved, needs type inference)
//   - Imported functions: from utils import sanitize; sanitize() → myapp.utils.sanitize
//   - Qualified calls: utils.sanitize() → myapp.utils.sanitize
//
// Algorithm:
//  1. Check if target is a simple name (no dots)
//     a. Look up in import map
//     b. If found, return FQN from import
//     c.
If not found, try to find in same module +// 2. If target has dots (qualified name) +// a. Split into base and rest +// b. Resolve base using import map +// c. Append rest to get full FQN +// 3. If all else fails, check if it exists in the registry +// +// Parameters: +// - target: the call target name (e.g., "sanitize", "utils.sanitize", "obj.method") +// - importMap: import mappings for the current file +// - registry: module registry for validation +// - currentModule: the module containing this call +// - codeGraph: the parsed code graph for validation +// - typeEngine: type inference engine +// - callerFQN: fully qualified name of the calling function +// - callGraph: the call graph being built +// +// Returns: +// - Fully qualified name of the target +// - Boolean indicating if resolution was successful +// - TypeInfo if resolved via type inference +// +// Examples: +// target="sanitize", imports={"sanitize": "myapp.utils.sanitize"} +// → "myapp.utils.sanitize", true, nil +// +// target="utils.sanitize", imports={"utils": "myapp.utils"} +// → "myapp.utils.sanitize", true, nil +// +// target="obj.method", imports={} +// → "obj.method", false, nil (needs type inference) +func ResolveCallTarget(target string, importMap *core.ImportMap, registry *core.ModuleRegistry, currentModule string, codeGraph *graph.CodeGraph, typeEngine *resolution.TypeInferenceEngine, callerFQN string, callGraph *core.CallGraph) (string, bool, *core.TypeInfo) { + return resolveCallTarget(target, importMap, registry, currentModule, codeGraph, typeEngine, callerFQN, callGraph) +} + +// resolveCallTarget is the internal implementation of ResolveCallTarget. 
+func resolveCallTarget(target string, importMap *core.ImportMap, registry *core.ModuleRegistry, currentModule string, codeGraph *graph.CodeGraph, typeEngine *resolution.TypeInferenceEngine, callerFQN string, callGraph *core.CallGraph) (string, bool, *core.TypeInfo) { + // Backward compatibility: if typeEngine or callerFQN not provided, skip type inference + if typeEngine == nil || callerFQN == "" { + fqn, resolved := resolveCallTargetLegacy(target, importMap, registry, currentModule, codeGraph) + return fqn, resolved, nil + } + + // Phase 3 Task 11: Check for method chaining BEFORE other resolution + // Chains have pattern "()." indicating call followed by attribute access + if strings.Contains(target, ").") { + chainFQN, chainResolved, chainType := resolution.ResolveChainedCall( + target, + typeEngine, + typeEngine.Builtins, + registry, + codeGraph, + callerFQN, + currentModule, + callGraph, + ) + if chainResolved { + return chainFQN, true, chainType + } + // Chain parsing attempted but failed - fall through to regular resolution + } + + // Phase 3 Task 12: Check for self.attribute.method() patterns BEFORE self.method() + // Pattern: self.attr.method (2+ dots starting with self.) + if strings.HasPrefix(target, "self.") && strings.Count(target, ".") >= 2 { + attrFQN, attrResolved, attrType := resolution.ResolveSelfAttributeCall( + target, + callerFQN, + typeEngine, + typeEngine.Builtins, + callGraph, + ) + if attrResolved { + return attrFQN, true, attrType + } + // Attribute resolution attempted but failed - fall through + } + + // Handle self.method() calls - resolve to current module + if strings.HasPrefix(target, "self.") { + methodName := strings.TrimPrefix(target, "self.") + // Resolve to module.method + moduleFQN := currentModule + "." 
+ methodName + // Validate exists + if validateFQN(moduleFQN, registry) { + return moduleFQN, true, nil + } + // Return unresolved but with module prefix + return moduleFQN, false, nil + } + + // Handle simple names (no dots) + if !strings.Contains(target, ".") { + // Check if it's a Python built-in + if pythonBuiltins[target] { + // Return as builtins.function for pattern matching + return "builtins." + target, true, nil + } + + // Try to resolve through imports + if fqn, ok := importMap.Resolve(target); ok { + // Found in imports - return the FQN + // Check if it's a known framework + if isKnown, _ := core.IsKnownFramework(fqn); isKnown { + return fqn, true, nil + } + // Validate if it exists in registry + resolved := validateFQN(fqn, registry) + return fqn, resolved, nil + } + + // Not in imports - might be in same module + sameLevelFQN := currentModule + "." + target + if validateFQN(sameLevelFQN, registry) { + return sameLevelFQN, true, nil + } + + // Can't resolve - return as-is + return target, false, nil + } + + // Handle qualified names (with dots) + parts := strings.SplitN(target, ".", 2) + base := parts[0] + rest := parts[1] + + // Phase 2 Task 9: Try type inference for variable.method() calls + if typeEngine != nil && callerFQN != "" { + // Try function scope first, then fall back to module scope + var binding *resolution.VariableBinding + + // Check function scope first + functionScope := typeEngine.GetScope(callerFQN) + if functionScope != nil { + if b, exists := functionScope.Variables[base]; exists { + binding = b + } + } + + // If not found in function scope, try module scope + if binding == nil { + moduleScope := typeEngine.GetScope(currentModule) + if moduleScope != nil { + if b, exists := moduleScope.Variables[base]; exists { + binding = b + } + } + } + + if binding != nil { + // Check if variable has type information + if binding.Type != nil { + typeFQN := binding.Type.TypeFQN + + // Skip placeholders (call:, var:) - not yet resolved + if 
strings.HasPrefix(typeFQN, "call:") || strings.HasPrefix(typeFQN, "var:") { + // Continue to legacy resolution + } else { + // Check if it's a builtin type + if typeEngine.Builtins != nil && strings.HasPrefix(typeFQN, "builtins.") { + method := typeEngine.Builtins.GetMethod(typeFQN, rest) + if method != nil { + // Resolved to builtin method - return with type info + return typeFQN + "." + rest, true, binding.Type + } + } + + // Check if it's a project type (user-defined class/method) + methodFQN := typeFQN + "." + rest + + // Validate method exists in code graph + if codeGraph != nil { + if node, ok := codeGraph.Nodes[methodFQN]; ok { + if node.Type == "method_declaration" || node.Type == "function_definition" { + // Resolved via code graph validation - return with type info + return methodFQN, true, binding.Type + } + } + + // Python class methods are stored at module level (e.g., test.save, not test.User.save) + // Try stripping the class name and looking for module.method + lastDot := strings.LastIndex(typeFQN, ".") + if lastDot >= 0 { + modulePart := typeFQN[:lastDot] + className := typeFQN[lastDot+1:] + + // Check if it looks like a Python class (PascalCase) + if len(className) > 0 && className[0] >= 'A' && className[0] <= 'Z' { + pythonMethodFQN := modulePart + "." + rest + if callGraph != nil { + if node, ok := callGraph.Functions[pythonMethodFQN]; ok { + if node.Type == "method_declaration" || node.Type == "function_definition" { + // Resolved via Python module-level method lookup + return pythonMethodFQN, true, binding.Type + } + } + } + } + } + } + + // Heuristic: If type has good confidence (>= 0.7), assume method exists + if binding.Type.Confidence >= 0.7 { + // Resolved via confidence heuristic - return with type info + return methodFQN, true, binding.Type + } + + } + } + } + } + + // Try to resolve base through imports + if baseFQN, ok := importMap.Resolve(base); ok { + fullFQN := baseFQN + "." 
+ rest + // Check if it's a known framework + if isKnown, _ := core.IsKnownFramework(fullFQN); isKnown { + return fullFQN, true, nil + } + // Check if it's an ORM pattern (before validateFQN, since ORM methods don't exist in source) + if ormFQN, resolved := resolution.ResolveORMCall(target, currentModule, registry, codeGraph); resolved { + return ormFQN, true, nil + } + // PR #3: Check stdlib registry before user project registry + if typeEngine != nil && typeEngine.StdlibRemote != nil { + if remoteLoader, ok := typeEngine.StdlibRemote.(*cgregistry.StdlibRegistryRemote); ok { + if validateStdlibFQN(fullFQN, remoteLoader) { + return fullFQN, true, nil + } + } + } + if validateFQN(fullFQN, registry) { + return fullFQN, true, nil + } + return fullFQN, false, nil + } + + // Base not in imports - might be module-level access + // Try current module + fullFQN := currentModule + "." + target + if validateFQN(fullFQN, registry) { + return fullFQN, true, nil + } + + // Before giving up, check if it's an ORM pattern (Django, SQLAlchemy, etc.) + // ORM methods are dynamically generated at runtime and won't be in source + if ormFQN, resolved := resolution.ResolveORMCall(target, currentModule, registry, codeGraph); resolved { + return ormFQN, true, nil + } + + // PR #3: Last resort - check if target is a stdlib call (e.g., os.path.join) + // This handles cases where stdlib modules are imported directly (import os.path) + if typeEngine != nil && typeEngine.StdlibRemote != nil { + if remoteLoader, ok := typeEngine.StdlibRemote.(*cgregistry.StdlibRegistryRemote); ok { + if validateStdlibFQN(target, remoteLoader) { + return target, true, nil + } + } + } + + // Can't resolve - return as-is + return target, false, nil +} + +// stdlibModuleAliases maps platform-specific module aliases to their canonical names. +// For example, os.path is posixpath on Unix/Linux/Mac and ntpath on Windows. 
+var stdlibModuleAliases = map[string]string{ + "os.path": "posixpath", // On POSIX systems (Unix, Linux, macOS) + // Note: On Windows, os.path would be ntpath, but we default to POSIX + // since most development happens on Unix-like systems +} + +// ValidateStdlibFQN checks if a fully qualified name is a stdlib function. +// Supports module.function, module.submodule.function, and module.Class patterns. +// Handles platform-specific module aliases (e.g., os.path -> posixpath). +// Uses lazy loading via remote registry to download modules on-demand. +// +// Examples: +// "os.getcwd" - returns true if os.getcwd exists in stdlib +// "os.path.join" - returns true if posixpath.join exists in stdlib (alias resolution) +// "json.dumps" - returns true if json.dumps exists in stdlib +// +// Parameters: +// - fqn: fully qualified name to check +// - remoteLoader: remote stdlib registry loader +// +// Returns: +// - true if FQN is a stdlib function or class +func ValidateStdlibFQN(fqn string, remoteLoader *cgregistry.StdlibRegistryRemote) bool { + return validateStdlibFQN(fqn, remoteLoader) +} + +// validateStdlibFQN is the internal implementation of ValidateStdlibFQN. +func validateStdlibFQN(fqn string, remoteLoader *cgregistry.StdlibRegistryRemote) bool { + if remoteLoader == nil { + return false + } + + // Split FQN into parts: os.path.join -> ["os", "path", "join"] + parts := strings.Split(fqn, ".") + if len(parts) < 2 { + return false + } + + // Try different module combinations + // For "os.path.join", try: + // 1. module="os.path", function="join" (with alias resolution) + // 2. module="os", function="path.join" + // 3. 
module="os", function="path" (submodule) + + // Try longest match first (os.path) + for i := len(parts) - 1; i >= 1; i-- { + moduleName := strings.Join(parts[:i], ".") + functionName := parts[i] + + // Check if this module is an alias (e.g., os.path -> posixpath) + if canonicalName, isAlias := stdlibModuleAliases[moduleName]; isAlias { + moduleName = canonicalName + } + + // Lazy load module from remote registry + module, err := remoteLoader.GetModule(moduleName) + if err != nil { + log.Printf("Warning: Failed to load stdlib module %s: %v", moduleName, err) + continue + } + if module == nil { + continue + } + + // Check if it's a function + if _, ok := module.Functions[functionName]; ok { + return true + } + + // Check if it's a class + if _, ok := module.Classes[functionName]; ok { + return true + } + + // Check if it's a constant + if _, ok := module.Constants[functionName]; ok { + return true + } + + // Check if it's an attribute + if _, ok := module.Attributes[functionName]; ok { + return true + } + } + + return false +} + +// ValidateFQN checks if a fully qualified name exists in the registry. +// Handles both module names and function names within modules. +// +// Examples: +// "myapp.utils" - checks if module exists +// "myapp.utils.sanitize" - checks if module "myapp.utils" exists +// +// Parameters: +// - fqn: fully qualified name to validate +// - registry: module registry +// +// Returns: +// - true if FQN is valid (module or function in existing module) +func ValidateFQN(fqn string, registry *core.ModuleRegistry) bool { + return validateFQN(fqn, registry) +} + +// validateFQN is the internal implementation of ValidateFQN. 
+func validateFQN(fqn string, registry *core.ModuleRegistry) bool { + // Check if it's a module + if _, ok := registry.Modules[fqn]; ok { + return true + } + + // Check if parent module exists (for functions) + // "myapp.utils.sanitize" → check if "myapp.utils" exists + lastDot := strings.LastIndex(fqn, ".") + if lastDot > 0 { + parentModule := fqn[:lastDot] + if _, ok := registry.Modules[parentModule]; ok { + return true + } + } + + return false +} + +// resolveCallTargetLegacy is the old resolution logic without type inference. +// Used for backward compatibility with existing tests. +func resolveCallTargetLegacy(target string, importMap *core.ImportMap, registry *core.ModuleRegistry, currentModule string, codeGraph *graph.CodeGraph) (string, bool) { + // Handle self.method() calls - resolve to current module + if strings.HasPrefix(target, "self.") { + methodName := strings.TrimPrefix(target, "self.") + // Resolve to module.method + moduleFQN := currentModule + "." + methodName + // Validate exists + if validateFQN(moduleFQN, registry) { + return moduleFQN, true + } + // Return unresolved but with module prefix + return moduleFQN, false + } + + // Handle simple names (no dots) + if !strings.Contains(target, ".") { + // Check if it's a Python built-in + if pythonBuiltins[target] { + // Return as builtins.function for pattern matching + return "builtins." + target, true + } + + // Try to resolve through imports + if fqn, ok := importMap.Resolve(target); ok { + // Found in imports - return the FQN + // Check if it's a known framework + if isKnown, _ := core.IsKnownFramework(fqn); isKnown { + return fqn, true + } + // Validate if it exists in registry + resolved := validateFQN(fqn, registry) + return fqn, resolved + } + + // Not in imports - might be in same module + sameLevelFQN := currentModule + "." 
+ target + if validateFQN(sameLevelFQN, registry) { + return sameLevelFQN, true + } + + // Can't resolve - return as-is + return target, false + } + + // Handle qualified names (with dots) + parts := strings.SplitN(target, ".", 2) + base := parts[0] + rest := parts[1] + + // Try to resolve base through imports + if baseFQN, ok := importMap.Resolve(base); ok { + fullFQN := baseFQN + "." + rest + // Check if it's a known framework + if isKnown, _ := core.IsKnownFramework(fullFQN); isKnown { + return fullFQN, true + } + // Check if it's an ORM pattern (before validateFQN, since ORM methods don't exist in source) + if ormFQN, resolved := resolution.ResolveORMCall(target, currentModule, registry, codeGraph); resolved { + return ormFQN, true + } + if validateFQN(fullFQN, registry) { + return fullFQN, true + } + return fullFQN, false + } + + // Base not in imports - might be module-level access + // Try current module + fullFQN := currentModule + "." + target + if validateFQN(fullFQN, registry) { + return fullFQN, true + } + + // Before giving up, check if it's an ORM pattern (Django, SQLAlchemy, etc.) + // ORM methods are dynamically generated at runtime and won't be in source + if ormFQN, resolved := resolution.ResolveORMCall(target, currentModule, registry, codeGraph); resolved { + return ormFQN, true + } + + // Can't resolve - return as-is + return target, false +} + +// DetectPythonVersion infers Python version from project files. +// It checks in order: +// 1. .python-version file +// 2. pyproject.toml [tool.poetry.dependencies] or [project] requires-python +// 3. Defaults to "3.14" +// +// Parameters: +// - projectPath: absolute path to the project root +// +// Returns: +// - Python version string (e.g., "3.14", "3.11", "3.9") +func DetectPythonVersion(projectPath string) string { + return detectPythonVersionInternal(projectPath) +} + +// detectPythonVersionInternal is the implementation - extracted from python_version_detector.go. 
+func detectPythonVersionInternal(projectPath string) string { + // 1. Check .python-version file + if version := readPythonVersionFile(projectPath); version != "" { + return version + } + + // 2. Check pyproject.toml + if version := parsePyprojectToml(projectPath); version != "" { + return version + } + + // 3. Default to 3.14 + return "3.14" +} + +// Helper functions for DetectPythonVersion. +func readPythonVersionFile(projectPath string) string { + versionFile := filepath.Join(projectPath, ".python-version") + data, err := ReadFileBytes(versionFile) + if err != nil { + return "" + } + + version := strings.TrimSpace(string(data)) + return extractMajorMinor(version) +} + +func parsePyprojectToml(projectPath string) string { + // Import the functionality from cgregistry which has the full implementation + // For now, we'll use a simplified version + tomlFile := filepath.Join(projectPath, "pyproject.toml") + data, err := ReadFileBytes(tomlFile) + if err != nil { + return "" + } + + // Very simple regex-free parsing - just look for version numbers + lines := strings.Split(string(data), "\n") + for _, line := range lines { + // Check for requires-python or python = patterns + if strings.Contains(line, "requires-python") || strings.Contains(line, "python") { + // Extract version number pattern (e.g., 3.11, 3.9, etc.) + parts := strings.Fields(line) + for _, part := range parts { + part = strings.Trim(part, `"'>=<~^`) + if strings.Contains(part, ".") && len(part) >= 3 && len(part) <= 5 { + // Check if it looks like a version (starts with digit) + if len(part) > 0 && part[0] >= '0' && part[0] <= '9' { + return extractMajorMinor(part) + } + } + } + } + } + + return "" +} + +func extractMajorMinor(version string) string { + parts := strings.Split(version, ".") + if len(parts) >= 2 { + return parts[0] + "." 
+ parts[1] + } + if len(parts) == 1 { + return parts[0] + } + return "" +} diff --git a/sourcecode-parser/graph/callgraph/builder/builder_test.go b/sourcecode-parser/graph/callgraph/builder/builder_test.go new file mode 100644 index 00000000..1a3b176e --- /dev/null +++ b/sourcecode-parser/graph/callgraph/builder/builder_test.go @@ -0,0 +1,273 @@ +package builder + +import ( + "os" + "path/filepath" + "testing" + + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/core" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/registry" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestBuildCallGraph(t *testing.T) { + // Create a temporary project + tmpDir := t.TempDir() + + mainPy := filepath.Join(tmpDir, "main.py") + err := os.WriteFile(mainPy, []byte(` +def greet(name): + return f"Hello, {name}" + +def main(): + message = greet("World") + print(message) +`), 0644) + require.NoError(t, err) + + // Parse project + codeGraph := graph.Initialize(tmpDir) + + // Build module registry + moduleRegistry, err := registry.BuildModuleRegistry(tmpDir) + require.NoError(t, err) + + // Build call graph + callGraph, err := BuildCallGraph(codeGraph, moduleRegistry, tmpDir) + require.NoError(t, err) + assert.NotNil(t, callGraph) + + // Verify functions were indexed + assert.NotEmpty(t, callGraph.Functions) + + // Verify edges exist + assert.NotNil(t, callGraph.Edges) + + // Verify reverse edges exist + assert.NotNil(t, callGraph.ReverseEdges) +} + +func TestIndexFunctions(t *testing.T) { + // Create a temporary project + tmpDir := t.TempDir() + + mainPy := filepath.Join(tmpDir, "test.py") + err := os.WriteFile(mainPy, []byte(` +def func1(): + pass + +def func2(): + pass + +class MyClass: + def method1(self): + pass +`), 0644) + require.NoError(t, err) + + // Parse project + codeGraph := graph.Initialize(tmpDir) + + // Build module 
registry + moduleRegistry, err := registry.BuildModuleRegistry(tmpDir) + require.NoError(t, err) + + // Create call graph and index functions + callGraph := core.NewCallGraph() + IndexFunctions(codeGraph, callGraph, moduleRegistry) + + // Verify functions were indexed + assert.NotEmpty(t, callGraph.Functions) + + // Count functions/methods + functionCount := 0 + for _, node := range callGraph.Functions { + if node.Type == "function_definition" || node.Type == "method_declaration" { + functionCount++ + } + } + assert.GreaterOrEqual(t, functionCount, 3, "Should have at least 3 functions/methods") +} + +func TestGetFunctionsInFile(t *testing.T) { + // Create a temporary file + tmpDir := t.TempDir() + testFile := filepath.Join(tmpDir, "test.py") + + err := os.WriteFile(testFile, []byte(` +def func1(): + pass + +def func2(): + pass +`), 0644) + require.NoError(t, err) + + // Parse file + codeGraph := graph.Initialize(tmpDir) + + // Get functions in file + functions := GetFunctionsInFile(codeGraph, testFile) + + // Verify functions were found + assert.NotEmpty(t, functions) + assert.GreaterOrEqual(t, len(functions), 2, "Should find at least 2 functions") +} + +func TestFindContainingFunction(t *testing.T) { + // Create a temporary project + tmpDir := t.TempDir() + + testFile := filepath.Join(tmpDir, "test.py") + err := os.WriteFile(testFile, []byte(` +def outer_function(): + x = 1 + y = 2 + return x + y +`), 0644) + require.NoError(t, err) + + // Parse file + codeGraph := graph.Initialize(tmpDir) + + // Get functions + functions := GetFunctionsInFile(codeGraph, testFile) + require.NotEmpty(t, functions) + + // Test finding containing function for a location inside the function + location := core.Location{ + File: testFile, + Line: 3, + Column: 5, // Inside function body + } + + modulePath := "test" + containingFQN := FindContainingFunction(location, functions, modulePath) + + // Should find the outer_function + assert.NotEmpty(t, containingFQN) + assert.Contains(t, 
containingFQN, "outer_function") +} + +func TestFindContainingFunction_ModuleLevel(t *testing.T) { + // Create a temporary project + tmpDir := t.TempDir() + + testFile := filepath.Join(tmpDir, "test.py") + err := os.WriteFile(testFile, []byte(` +MODULE_VAR = 42 + +def my_function(): + pass +`), 0644) + require.NoError(t, err) + + // Parse file + codeGraph := graph.Initialize(tmpDir) + + functions := GetFunctionsInFile(codeGraph, testFile) + + // Test module-level code (column == 1) + location := core.Location{ + File: testFile, + Line: 2, + Column: 1, // Module level + } + + modulePath := "test" + containingFQN := FindContainingFunction(location, functions, modulePath) + + // Should return empty for module-level code + assert.Empty(t, containingFQN) +} + +func TestValidateFQN(t *testing.T) { + moduleRegistry := core.NewModuleRegistry() + + // Add a test module + moduleRegistry.Modules["mymodule"] = "/path/to/mymodule.py" + moduleRegistry.FileToModule["/path/to/mymodule.py"] = "mymodule" + + tests := []struct { + name string + fqn string + expected bool + }{ + {"Valid module FQN", "mymodule.func", true}, + {"Invalid module FQN", "unknownmodule.func", false}, + {"Empty FQN", "", false}, + {"Valid module name without dot", "mymodule", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := ValidateFQN(tt.fqn, moduleRegistry) + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestDetectPythonVersion(t *testing.T) { + // Create a temporary project + tmpDir := t.TempDir() + + // Test with .python-version file + pythonVersionFile := filepath.Join(tmpDir, ".python-version") + err := os.WriteFile(pythonVersionFile, []byte("3.11.0\n"), 0644) + require.NoError(t, err) + + version := DetectPythonVersion(tmpDir) + assert.NotEmpty(t, version) + assert.Contains(t, version, "3.11") +} + +func TestDetectPythonVersion_NoPythonVersionFile(t *testing.T) { + // Create an empty temporary directory + tmpDir := t.TempDir() + + // Should fall 
back to checking pyproject.toml or default + version := DetectPythonVersion(tmpDir) + // Should return a default version or detect from system + assert.NotEmpty(t, version) +} + +func TestBuildCallGraph_WithEdges(t *testing.T) { + // Create a project with function calls + tmpDir := t.TempDir() + + mainPy := filepath.Join(tmpDir, "main.py") + err := os.WriteFile(mainPy, []byte(` +def helper(): + return 42 + +def caller(): + result = helper() + return result +`), 0644) + require.NoError(t, err) + + // Parse and build call graph + codeGraph := graph.Initialize(tmpDir) + + moduleRegistry, err := registry.BuildModuleRegistry(tmpDir) + require.NoError(t, err) + + callGraph, err := BuildCallGraph(codeGraph, moduleRegistry, tmpDir) + require.NoError(t, err) + + // Verify edges were created + assert.NotEmpty(t, callGraph.Edges) + + // Check that caller has edges to helper + foundEdge := false + for callerFQN, callees := range callGraph.Edges { + if len(callees) > 0 { + foundEdge = true + t.Logf("Function %s calls: %v", callerFQN, callees) + } + } + + assert.True(t, foundEdge, "Expected at least one call edge") +} diff --git a/sourcecode-parser/graph/callgraph/builder/cache.go b/sourcecode-parser/graph/callgraph/builder/cache.go new file mode 100644 index 00000000..37aafba7 --- /dev/null +++ b/sourcecode-parser/graph/callgraph/builder/cache.go @@ -0,0 +1,88 @@ +package builder + +import ( + "sync" + + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/core" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/resolution" +) + +// ImportMapCache provides thread-safe caching of ImportMap instances. +// It prevents redundant import extraction by caching results keyed by file path. 
+// +// Thread-safety: +// - All methods are safe for concurrent use +// - Uses RWMutex for optimized read-heavy workloads +// - GetOrExtract handles double-checked locking pattern +type ImportMapCache struct { + cache map[string]*core.ImportMap // Maps file path to ImportMap + mu sync.RWMutex // Protects cache map +} + +// NewImportMapCache creates a new empty import map cache. +func NewImportMapCache() *ImportMapCache { + return &ImportMapCache{ + cache: make(map[string]*core.ImportMap), + } +} + +// Get retrieves an ImportMap from the cache if it exists. +// +// Parameters: +// - filePath: absolute path to the Python file +// +// Returns: +// - ImportMap and true if found in cache, nil and false otherwise +func (c *ImportMapCache) Get(filePath string) (*core.ImportMap, bool) { + c.mu.RLock() + defer c.mu.RUnlock() + + importMap, ok := c.cache[filePath] + return importMap, ok +} + +// Put stores an ImportMap in the cache. +// +// Parameters: +// - filePath: absolute path to the Python file +// - importMap: the extracted ImportMap to cache +func (c *ImportMapCache) Put(filePath string, importMap *core.ImportMap) { + c.mu.Lock() + defer c.mu.Unlock() + + c.cache[filePath] = importMap +} + +// GetOrExtract retrieves an ImportMap from cache or extracts it if not cached. +// This is the main entry point for using the cache. 
+// +// Parameters: +// - filePath: absolute path to the Python file +// - sourceCode: file contents (only used if extraction needed) +// - registry: module registry for resolving imports +// +// Returns: +// - ImportMap from cache or newly extracted +// - error if extraction fails (cache misses only) +// +// Thread-safety: +// - Multiple goroutines can safely call GetOrExtract concurrently +// - First caller for a file will extract and cache +// - Subsequent callers will get cached result +func (c *ImportMapCache) GetOrExtract(filePath string, sourceCode []byte, registry *core.ModuleRegistry) (*core.ImportMap, error) { + // Try to get from cache (fast path with read lock) + if importMap, ok := c.Get(filePath); ok { + return importMap, nil + } + + // Cache miss - extract imports (expensive operation) + importMap, err := resolution.ExtractImports(filePath, sourceCode, registry) + if err != nil { + return nil, err + } + + // Store in cache for future use + c.Put(filePath, importMap) + + return importMap, nil +} diff --git a/sourcecode-parser/graph/callgraph/builder/cache_test.go b/sourcecode-parser/graph/callgraph/builder/cache_test.go new file mode 100644 index 00000000..1aea3352 --- /dev/null +++ b/sourcecode-parser/graph/callgraph/builder/cache_test.go @@ -0,0 +1,173 @@ +package builder + +import ( + "sync" + "testing" + + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewImportMapCache(t *testing.T) { + cache := NewImportMapCache() + assert.NotNil(t, cache) + assert.NotNil(t, cache.cache) + assert.Empty(t, cache.cache) +} + +func TestImportMapCache_GetPut(t *testing.T) { + cache := NewImportMapCache() + filePath := "/test/file.py" + + // Initially should not exist + importMap, ok := cache.Get(filePath) + assert.False(t, ok) + assert.Nil(t, importMap) + + // Put an ImportMap + expectedImportMap := core.NewImportMap(filePath) + 
expectedImportMap.AddImport("os", "os") + cache.Put(filePath, expectedImportMap) + + // Should now exist + importMap, ok = cache.Get(filePath) + assert.True(t, ok) + assert.Equal(t, expectedImportMap, importMap) +} + +func TestImportMapCache_GetOrExtract_CacheHit(t *testing.T) { + cache := NewImportMapCache() + filePath := "/test/file.py" + + // Pre-populate cache + expectedImportMap := core.NewImportMap(filePath) + expectedImportMap.AddImport("sys", "sys") + cache.Put(filePath, expectedImportMap) + + // GetOrExtract should return cached value without calling ExtractImports + importMap, err := cache.GetOrExtract(filePath, nil, nil) + assert.NoError(t, err) + assert.Equal(t, expectedImportMap, importMap) +} + +func TestImportMapCache_GetOrExtract_CacheMiss(t *testing.T) { + cache := NewImportMapCache() + filePath := "/test/file.py" + + // Simple Python code with imports + sourceCode := []byte(`import os +import sys +from pathlib import Path +`) + + registry := core.NewModuleRegistry() + + // GetOrExtract should extract and cache + importMap, err := cache.GetOrExtract(filePath, sourceCode, registry) + require.NoError(t, err) + assert.NotNil(t, importMap) + + // Should now be in cache + cachedImportMap, ok := cache.Get(filePath) + assert.True(t, ok) + assert.Equal(t, importMap, cachedImportMap) +} + +func TestImportMapCache_ConcurrentAccess(t *testing.T) { + cache := NewImportMapCache() + filePath := "/test/file.py" + + sourceCode := []byte(`import os`) + registry := core.NewModuleRegistry() + + const numGoroutines = 100 + var wg sync.WaitGroup + wg.Add(numGoroutines) + + results := make([]*core.ImportMap, numGoroutines) + + // Multiple goroutines try to get/extract concurrently + for i := 0; i < numGoroutines; i++ { + go func(index int) { + defer wg.Done() + importMap, err := cache.GetOrExtract(filePath, sourceCode, registry) + assert.NoError(t, err) + results[index] = importMap + }(i) + } + + wg.Wait() + + // All results should be non-nil + for i := 0; i < 
numGoroutines; i++ { + assert.NotNil(t, results[i]) + } + + // All should point to the same cached instance + firstResult := results[0] + for i := 1; i < numGoroutines; i++ { + assert.Equal(t, firstResult, results[i], "Result %d should match first result", i) + } +} + +func TestImportMapCache_ConcurrentPut(t *testing.T) { + cache := NewImportMapCache() + + const numGoroutines = 50 + var wg sync.WaitGroup + wg.Add(numGoroutines) + + // Multiple goroutines put different files concurrently + for i := 0; i < numGoroutines; i++ { + go func(index int) { + defer wg.Done() + filePath := "/test/file" + string(rune('0'+index)) + ".py" + importMap := core.NewImportMap(filePath) + cache.Put(filePath, importMap) + }(i) + } + + wg.Wait() + + // All files should be in cache + for i := 0; i < numGoroutines; i++ { + filePath := "/test/file" + string(rune('0'+i)) + ".py" + importMap, ok := cache.Get(filePath) + assert.True(t, ok, "File %d should be in cache", i) + assert.NotNil(t, importMap) + } +} + +func TestImportMapCache_MultiplePutsForSameFile(t *testing.T) { + cache := NewImportMapCache() + filePath := "/test/file.py" + + // First put + importMap1 := core.NewImportMap(filePath) + importMap1.AddImport("os", "os") + cache.Put(filePath, importMap1) + + // Second put should replace + importMap2 := core.NewImportMap(filePath) + importMap2.AddImport("sys", "sys") + cache.Put(filePath, importMap2) + + // Should get the second one + retrieved, ok := cache.Get(filePath) + assert.True(t, ok) + assert.Equal(t, importMap2, retrieved) + assert.NotEqual(t, importMap1, retrieved) +} + +func TestImportMapCache_EmptyFilePath(t *testing.T) { + cache := NewImportMapCache() + + // Empty file path should work (edge case) + importMap := core.NewImportMap("") + cache.Put("", importMap) + + retrieved, ok := cache.Get("") + assert.True(t, ok) + assert.Equal(t, importMap, retrieved) +} diff --git a/sourcecode-parser/graph/callgraph/builder/doc.go b/sourcecode-parser/graph/callgraph/builder/doc.go new 
file mode 100644 index 00000000..66a07904 --- /dev/null +++ b/sourcecode-parser/graph/callgraph/builder/doc.go @@ -0,0 +1,53 @@ +// Package builder provides call graph construction orchestration. +// +// This package ties together all components to build a complete call graph: +// - Module registry (registry package) +// - Type inference (resolution package) +// - Import resolution (resolution package) +// - Call site extraction (extraction package) +// - Advanced resolution (resolution package) +// - Pattern detection (patterns package) +// - Taint analysis (analysis/taint package) +// +// # Basic Usage +// +// // Build from existing code graph +// callGraph, err := builder.BuildCallGraph(codeGraph, moduleRegistry, projectRoot) +// +// # Call Resolution Strategy +// +// The builder uses a multi-strategy approach to resolve function calls: +// 1. Direct import resolution +// 2. Method chaining with type inference +// 3. Self-attribute resolution (self.attr.method) +// 4. Type inference for variable.method() calls +// 5. ORM pattern detection (Django, SQLAlchemy) +// 6. Framework detection (known external frameworks) +// 7. Standard library resolution via remote CDN +// +// Each strategy is tried in order until one succeeds. +// +// # Multi-Pass Architecture +// +// The builder performs multiple passes over the codebase: +// +// Pass 1: Index all function definitions +// Pass 2: Extract return types from all functions +// Pass 3: Extract variable assignments and type bindings +// Pass 4: Extract class attributes +// Pass 5: Resolve call sites and build call graph edges +// Pass 6: Generate taint summaries for security analysis +// +// This multi-pass approach ensures that all necessary type information +// is collected before attempting to resolve call sites. +// +// # Caching +// +// The builder uses ImportMapCache to avoid re-parsing imports from +// the same file multiple times, significantly improving performance. 
// ReadFileBytes returns the full contents of the file at filePath.
// The path is made absolute before the file is read.
//
// Parameters:
//   - filePath: path to the file (relative or absolute)
//
// Returns:
//   - the file contents
//   - the error from resolving the path or from reading the file, unchanged
func ReadFileBytes(filePath string) ([]byte, error) {
	resolved, absErr := filepath.Abs(filePath)
	if absErr != nil {
		return nil, absErr
	}

	contents, readErr := os.ReadFile(resolved)
	return contents, readErr
}
+// +// Parameters: +// - root: the root tree-sitter node to search from +// - lineNumber: the line number to search for (1-indexed) +// +// Returns: +// - tree-sitter node for the function definition, or nil if not found +func FindFunctionAtLine(root *sitter.Node, lineNumber uint32) *sitter.Node { + if root == nil { + return nil + } + + // Check if this node is a function definition at the target line + if (root.Type() == "function_definition" || root.Type() == "method_declaration") && + root.StartPoint().Row+1 == lineNumber { + return root + } + + // Recursively search children + for i := 0; i < int(root.ChildCount()); i++ { + if result := FindFunctionAtLine(root.Child(i), lineNumber); result != nil { + return result + } + } + + return nil +} diff --git a/sourcecode-parser/graph/callgraph/builder/helpers_test.go b/sourcecode-parser/graph/callgraph/builder/helpers_test.go new file mode 100644 index 00000000..5a5609fb --- /dev/null +++ b/sourcecode-parser/graph/callgraph/builder/helpers_test.go @@ -0,0 +1,103 @@ +package builder + +import ( + "os" + "path/filepath" + "testing" + + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/extraction" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestReadFileBytes(t *testing.T) { + // Create a temporary file + tmpDir := t.TempDir() + testFile := filepath.Join(tmpDir, "test.txt") + testContent := []byte("Hello, World!\nTest content") + + err := os.WriteFile(testFile, testContent, 0644) + require.NoError(t, err) + + // Test reading the file + content, err := ReadFileBytes(testFile) + assert.NoError(t, err) + assert.Equal(t, testContent, content) +} + +func TestReadFileBytes_NonExistent(t *testing.T) { + content, err := ReadFileBytes("/nonexistent/file.txt") + assert.Error(t, err) + assert.Nil(t, content) +} + +func TestFindFunctionAtLine(t *testing.T) { + sourceCode := []byte(` +def function_at_line_2(): + pass + +def function_at_line_5(): + return 42 + +class 
MyClass: + def method_at_line_9(self): + pass +`) + + tree, err := extraction.ParsePythonFile(sourceCode) + require.NoError(t, err) + defer tree.Close() + + tests := []struct { + name string + lineNumber uint32 + expected bool + }{ + {"Find function at line 2", 2, true}, + {"Find function at line 5", 5, true}, + {"Find method at line 9", 9, true}, + {"No function at line 1", 1, false}, + {"No function at line 3", 3, false}, + {"No function at line 10", 10, false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := FindFunctionAtLine(tree.RootNode(), tt.lineNumber) + if tt.expected { + assert.NotNil(t, result, "Expected to find function at line %d", tt.lineNumber) + assert.Equal(t, "function_definition", result.Type()) + } else { + assert.Nil(t, result, "Expected no function at line %d", tt.lineNumber) + } + }) + } +} + +func TestFindFunctionAtLine_NilRoot(t *testing.T) { + result := FindFunctionAtLine(nil, 1) + assert.Nil(t, result) +} + +func TestFindFunctionAtLine_NestedFunctions(t *testing.T) { + sourceCode := []byte(` +def outer(): + def inner(): + pass + return inner +`) + + tree, err := extraction.ParsePythonFile(sourceCode) + require.NoError(t, err) + defer tree.Close() + + // Should find outer function at line 2 + result := FindFunctionAtLine(tree.RootNode(), 2) + assert.NotNil(t, result) + assert.Equal(t, "function_definition", result.Type()) + + // Should find inner function at line 3 + result = FindFunctionAtLine(tree.RootNode(), 3) + assert.NotNil(t, result) + assert.Equal(t, "function_definition", result.Type()) +} diff --git a/sourcecode-parser/graph/callgraph/builder/integration.go b/sourcecode-parser/graph/callgraph/builder/integration.go new file mode 100644 index 00000000..1386335a --- /dev/null +++ b/sourcecode-parser/graph/callgraph/builder/integration.go @@ -0,0 +1,49 @@ +package builder + +import ( + "time" + + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph" + 
"github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/core" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/registry" +) + +// BuildCallGraphFromPath is a convenience function that builds a call graph +// from a project directory path. +// +// It performs all three passes: +// 1. Build module registry +// 2. Parse code graph (uses existing parsed graph) +// 3. Build call graph +// +// Parameters: +// - codeGraph: the parsed code graph from graph.Initialize() +// - projectPath: absolute path to project root +// +// Returns: +// - CallGraph: complete call graph with edges and call sites +// - ModuleRegistry: module path mappings +// - error: if any step fails +func BuildCallGraphFromPath(codeGraph *graph.CodeGraph, projectPath string) (*core.CallGraph, *core.ModuleRegistry, error) { + // Pass 1: Build module registry + startRegistry := time.Now() + moduleRegistry, err := registry.BuildModuleRegistry(projectPath) + if err != nil { + return nil, nil, err + } + elapsedRegistry := time.Since(startRegistry) + + // Pass 2-3: Build call graph (includes import extraction and call site extraction) + startCallGraph := time.Now() + callGraph, err := BuildCallGraph(codeGraph, moduleRegistry, projectPath) + if err != nil { + return nil, nil, err + } + elapsedCallGraph := time.Since(startCallGraph) + + // Log timing information + graph.Log("Module registry built in:", elapsedRegistry) + graph.Log("Call graph built in:", elapsedCallGraph) + + return callGraph, moduleRegistry, nil +} diff --git a/sourcecode-parser/graph/callgraph/builder/integration_test.go b/sourcecode-parser/graph/callgraph/builder/integration_test.go new file mode 100644 index 00000000..03ea5742 --- /dev/null +++ b/sourcecode-parser/graph/callgraph/builder/integration_test.go @@ -0,0 +1,101 @@ +package builder + +import ( + "os" + "path/filepath" + "testing" + + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph" + "github.com/stretchr/testify/assert" + 
"github.com/stretchr/testify/require" +) + +func TestBuildCallGraphFromPath(t *testing.T) { + // Create a temporary project + tmpDir := t.TempDir() + + // Create a simple Python file + mainPy := filepath.Join(tmpDir, "main.py") + err := os.WriteFile(mainPy, []byte(` +def greet(name): + return f"Hello, {name}" + +def main(): + message = greet("World") + print(message) + +if __name__ == "__main__": + main() +`), 0644) + require.NoError(t, err) + + // Parse the project to get code graph + codeGraph := graph.Initialize(tmpDir) + assert.NotNil(t, codeGraph) + + // Build call graph from path + callGraph, moduleRegistry, err := BuildCallGraphFromPath(codeGraph, tmpDir) + require.NoError(t, err) + assert.NotNil(t, callGraph) + assert.NotNil(t, moduleRegistry) + + // Verify functions were indexed + assert.NotEmpty(t, callGraph.Functions) + + // Verify module registry was built + assert.NotEmpty(t, moduleRegistry.Modules) +} + +func TestBuildCallGraphFromPath_EmptyProject(t *testing.T) { + // Create an empty temporary directory + tmpDir := t.TempDir() + + // Parse the empty project + codeGraph := graph.Initialize(tmpDir) + + // Build call graph should succeed but be empty + callGraph, moduleRegistry, err := BuildCallGraphFromPath(codeGraph, tmpDir) + require.NoError(t, err) + assert.NotNil(t, callGraph) + assert.NotNil(t, moduleRegistry) + assert.Empty(t, callGraph.Functions) +} + +func TestBuildCallGraphFromPath_WithImports(t *testing.T) { + // Create a temporary project + tmpDir := t.TempDir() + + // Create utils.py + utilsPy := filepath.Join(tmpDir, "utils.py") + err := os.WriteFile(utilsPy, []byte(` +def helper(): + return 42 +`), 0644) + require.NoError(t, err) + + // Create main.py that imports utils + mainPy := filepath.Join(tmpDir, "main.py") + err = os.WriteFile(mainPy, []byte(` +from utils import helper + +def main(): + result = helper() + return result +`), 0644) + require.NoError(t, err) + + // Parse the project + codeGraph := graph.Initialize(tmpDir) + + // 
Build call graph + callGraph, moduleRegistry, err := BuildCallGraphFromPath(codeGraph, tmpDir) + require.NoError(t, err) + assert.NotNil(t, callGraph) + assert.NotNil(t, moduleRegistry) + + // Verify both modules are registered + assert.GreaterOrEqual(t, len(moduleRegistry.Modules), 2) + + // Verify functions from both files are indexed + assert.NotEmpty(t, callGraph.Functions) +} diff --git a/sourcecode-parser/graph/callgraph/builder/taint.go b/sourcecode-parser/graph/callgraph/builder/taint.go new file mode 100644 index 00000000..1fb5a3be --- /dev/null +++ b/sourcecode-parser/graph/callgraph/builder/taint.go @@ -0,0 +1,96 @@ +package builder + +import ( + "log" + + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/analysis/taint" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/core" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/extraction" +) + +// GenerateTaintSummaries analyzes all Python functions for taint flows. +// This is Pass 5 of the call graph building process. +// +// For each function: +// 1. Extract statements from AST +// 2. Build def-use chains +// 3. Analyze intra-procedural taint +// 4. 
Store TaintSummary in callGraph.Summaries +// +// Parameters: +// - callGraph: the call graph being built (will be populated with summaries) +// - codeGraph: the parsed AST nodes (currently unused, reserved for future use) +// - registry: module registry (currently unused, reserved for future use) +func GenerateTaintSummaries(callGraph *core.CallGraph, codeGraph *graph.CodeGraph, registry *core.ModuleRegistry) { + _ = codeGraph // Reserved for future use + _ = registry // Reserved for future use + analyzed := 0 + total := len(callGraph.Functions) + + // Iterate over all indexed functions + for funcFQN, funcNode := range callGraph.Functions { + // Read source code for this function's file + sourceCode, err := ReadFileBytes(funcNode.File) + if err != nil { + log.Printf("Warning: failed to read file %s for taint analysis: %v", funcNode.File, err) + continue + } + + // Parse the Python file to get AST + tree, err := extraction.ParsePythonFile(sourceCode) + if err != nil { + log.Printf("Warning: failed to parse %s for taint analysis: %v", funcNode.File, err) + continue + } + + // Find the function node in the AST by line number + functionNode := FindFunctionAtLine(tree.RootNode(), funcNode.LineNumber) + if functionNode == nil { + log.Printf("Warning: could not find function %s at line %d", funcFQN, funcNode.LineNumber) + if tree != nil { + tree.Close() + } + continue + } + + // Step 1: Extract statements from function + statements, err := extraction.ExtractStatements(funcNode.File, sourceCode, functionNode) + if err != nil { + log.Printf("Warning: failed to extract statements from %s: %v", funcFQN, err) + if tree != nil { + tree.Close() + } + continue + } + + // Step 2: Build def-use chains + defUseChain := core.BuildDefUseChains(statements) + + // Step 3: Analyze intra-procedural taint + // For MVP: use empty sources/sinks/sanitizers (will be populated from patterns in PR #6) + summary := taint.AnalyzeIntraProceduralTaint( + funcFQN, + statements, + defUseChain, + 
[]string{}, // sources - will come from patterns + []string{}, // sinks - will come from patterns + []string{}, // sanitizers - will come from patterns + ) + + // Step 4: Store summary + callGraph.Summaries[funcFQN] = summary + + analyzed++ + + // Report progress every 1000 functions + if analyzed%1000 == 0 { + log.Printf("Analyzed %d/%d functions...", analyzed, total) + } + + // Clean up tree-sitter tree + if tree != nil { + tree.Close() + } + } +} diff --git a/sourcecode-parser/graph/callgraph/builder/taint_test.go b/sourcecode-parser/graph/callgraph/builder/taint_test.go new file mode 100644 index 00000000..b533bc8e --- /dev/null +++ b/sourcecode-parser/graph/callgraph/builder/taint_test.go @@ -0,0 +1,126 @@ +package builder + +import ( + "os" + "path/filepath" + "testing" + + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/core" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/registry" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestGenerateTaintSummaries(t *testing.T) { + // Create a temporary project + tmpDir := t.TempDir() + + // Create a Python file with potential taint flow + mainPy := filepath.Join(tmpDir, "main.py") + err := os.WriteFile(mainPy, []byte(` +def get_user_input(): + return input("Enter data: ") + +def sanitize(data): + return data.strip() + +def process(data): + clean_data = sanitize(data) + return clean_data + +def unsafe_process(data): + # Direct use without sanitization + exec(data) + +def main(): + user_data = get_user_input() + result = process(user_data) + unsafe_process(user_data) +`), 0644) + require.NoError(t, err) + + // Parse the project + codeGraph := graph.Initialize(tmpDir) + + // Build module registry + moduleRegistry, err := registry.BuildModuleRegistry(tmpDir) + require.NoError(t, err) + + // Build call graph + callGraph, err := BuildCallGraph(codeGraph, 
moduleRegistry, tmpDir) + require.NoError(t, err) + + // Generate taint summaries + GenerateTaintSummaries(callGraph, codeGraph, moduleRegistry) + + // Verify summaries were generated + assert.NotNil(t, callGraph.Summaries) + + // Check that some functions have taint summaries + foundSummary := false + for funcFQN := range callGraph.Functions { + if summary, exists := callGraph.Summaries[funcFQN]; exists { + foundSummary = true + assert.NotNil(t, summary) + // Summaries should have detections + if len(summary.Detections) > 0 { + t.Logf("Function %s has taint summary: %d detections", + funcFQN, len(summary.Detections)) + } + } + } + + assert.True(t, foundSummary, "Expected at least one taint summary to be generated") +} + +func TestGenerateTaintSummaries_EmptyCallGraph(t *testing.T) { + callGraph := core.NewCallGraph() + codeGraph := &graph.CodeGraph{ + Nodes: make(map[string]*graph.Node), + Edges: make([]*graph.Edge, 0), + } + moduleRegistry := core.NewModuleRegistry() + + // Should not panic with empty inputs + GenerateTaintSummaries(callGraph, codeGraph, moduleRegistry) + + // Summaries should be initialized but empty + assert.NotNil(t, callGraph.Summaries) + assert.Empty(t, callGraph.Summaries) +} + +func TestGenerateTaintSummaries_NoTaintFlow(t *testing.T) { + // Create a temporary project with safe code + tmpDir := t.TempDir() + + mainPy := filepath.Join(tmpDir, "main.py") + err := os.WriteFile(mainPy, []byte(` +def add(a, b): + return a + b + +def multiply(a, b): + return a * b + +def calculate(): + x = add(2, 3) + y = multiply(x, 4) + return y +`), 0644) + require.NoError(t, err) + + // Parse and build call graph + codeGraph := graph.Initialize(tmpDir) + + moduleRegistry, err := registry.BuildModuleRegistry(tmpDir) + require.NoError(t, err) + + callGraph, err := BuildCallGraph(codeGraph, moduleRegistry, tmpDir) + require.NoError(t, err) + + // Generate taint summaries + GenerateTaintSummaries(callGraph, codeGraph, moduleRegistry) + + // Summaries should 
exist but most won't have sources/sinks + assert.NotNil(t, callGraph.Summaries) +} diff --git a/sourcecode-parser/graph/callgraph/cache_test.go b/sourcecode-parser/graph/callgraph/cache_test.go deleted file mode 100644 index 82b29310..00000000 --- a/sourcecode-parser/graph/callgraph/cache_test.go +++ /dev/null @@ -1,211 +0,0 @@ -package callgraph - -import ( - "sync" - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestNewImportMapCache(t *testing.T) { - cache := NewImportMapCache() - assert.NotNil(t, cache) - assert.NotNil(t, cache.cache) - assert.Equal(t, 0, len(cache.cache)) -} - -func TestImportMapCache_GetEmpty(t *testing.T) { - cache := NewImportMapCache() - - importMap, ok := cache.Get("/nonexistent/file.py") - assert.False(t, ok) - assert.Nil(t, importMap) -} - -func TestImportMapCache_PutAndGet(t *testing.T) { - cache := NewImportMapCache() - filePath := "/test/file.py" - - // Create a test ImportMap - testImportMap := NewImportMap(filePath) - testImportMap.AddImport("os", "os") - testImportMap.AddImport("json", "json") - - // Put in cache - cache.Put(filePath, testImportMap) - - // Get from cache - retrieved, ok := cache.Get(filePath) - assert.True(t, ok) - assert.NotNil(t, retrieved) - assert.Equal(t, filePath, retrieved.FilePath) - assert.Equal(t, "os", retrieved.Imports["os"]) - assert.Equal(t, "json", retrieved.Imports["json"]) -} - -func TestImportMapCache_GetOrExtract_CacheHit(t *testing.T) { - cache := NewImportMapCache() - registry := NewModuleRegistry() - filePath := "/test/file.py" - - // Pre-populate cache - cachedImportMap := NewImportMap(filePath) - cachedImportMap.AddImport("cached", "cached.module") - cache.Put(filePath, cachedImportMap) - - // GetOrExtract should return cached version (sourceCode won't be used) - result, err := cache.GetOrExtract(filePath, []byte("# dummy code"), registry) - assert.NoError(t, err) - assert.NotNil(t, result) - assert.Equal(t, "cached.module", result.Imports["cached"]) -} - -func 
TestImportMapCache_GetOrExtract_CacheMiss(t *testing.T) { - cache := NewImportMapCache() - registry := NewModuleRegistry() - filePath := "../../../test-src/python/imports_test/simple_imports.py" - - // Read test file - sourceCode, err := readFileBytes(filePath) - assert.NoError(t, err) - - // GetOrExtract should extract and cache - result, err := cache.GetOrExtract(filePath, sourceCode, registry) - assert.NoError(t, err) - assert.NotNil(t, result) - - // Verify it's now in cache - cached, ok := cache.Get(filePath) - assert.True(t, ok) - assert.Equal(t, result, cached) -} - -func TestImportMapCache_Concurrent(t *testing.T) { - cache := NewImportMapCache() - registry := NewModuleRegistry() - filePath := "../../../test-src/python/imports_test/simple_imports.py" - - sourceCode, err := readFileBytes(filePath) - assert.NoError(t, err) - - // Launch multiple goroutines to access cache concurrently - const numGoroutines = 10 - var wg sync.WaitGroup - wg.Add(numGoroutines) - - errors := make([]error, numGoroutines) - results := make([]*ImportMap, numGoroutines) - - for i := 0; i < numGoroutines; i++ { - go func(index int) { - defer wg.Done() - result, getErr := cache.GetOrExtract(filePath, sourceCode, registry) - errors[index] = getErr - results[index] = result - }(i) - } - - wg.Wait() - - // All goroutines should succeed - for i := 0; i < numGoroutines; i++ { - assert.NoError(t, errors[i], "Goroutine %d should not error", i) - assert.NotNil(t, results[i], "Goroutine %d should return a result", i) - } - - // All results should be identical (same cached instance or semantically equal) - for i := 1; i < numGoroutines; i++ { - assert.Equal(t, results[0].FilePath, results[i].FilePath) - assert.Equal(t, len(results[0].Imports), len(results[i].Imports)) - } - - // Cache should only contain one entry - assert.Equal(t, 1, len(cache.cache)) -} - -func TestImportMapCache_MultipleFiles(t *testing.T) { - cache := NewImportMapCache() - - file1 := "/test/file1.py" - file2 := 
"/test/file2.py" - file3 := "/test/file3.py" - - // Add multiple entries - cache.Put(file1, NewImportMap(file1)) - cache.Put(file2, NewImportMap(file2)) - cache.Put(file3, NewImportMap(file3)) - - // Verify all are cached - _, ok1 := cache.Get(file1) - _, ok2 := cache.Get(file2) - _, ok3 := cache.Get(file3) - - assert.True(t, ok1) - assert.True(t, ok2) - assert.True(t, ok3) - assert.Equal(t, 3, len(cache.cache)) -} - -func TestImportMapCache_OverwriteExisting(t *testing.T) { - cache := NewImportMapCache() - filePath := "/test/file.py" - - // Add first version - firstMap := NewImportMap(filePath) - firstMap.AddImport("first", "first.module") - cache.Put(filePath, firstMap) - - // Overwrite with second version - secondMap := NewImportMap(filePath) - secondMap.AddImport("second", "second.module") - cache.Put(filePath, secondMap) - - // Should have second version - result, ok := cache.Get(filePath) - assert.True(t, ok) - assert.Equal(t, "second.module", result.Imports["second"]) - assert.NotContains(t, result.Imports, "first") -} - -func BenchmarkImportMapCache_Get(b *testing.B) { - cache := NewImportMapCache() - filePath := "/test/file.py" - testMap := NewImportMap(filePath) - cache.Put(filePath, testMap) - - b.ResetTimer() - b.ReportAllocs() - - for i := 0; i < b.N; i++ { - _, _ = cache.Get(filePath) - } -} - -func BenchmarkImportMapCache_Put(b *testing.B) { - cache := NewImportMapCache() - - b.ResetTimer() - b.ReportAllocs() - - for i := 0; i < b.N; i++ { - filePath := "/test/file.py" - testMap := NewImportMap(filePath) - cache.Put(filePath, testMap) - } -} - -func BenchmarkImportMapCache_ConcurrentGet(b *testing.B) { - cache := NewImportMapCache() - filePath := "/test/file.py" - testMap := NewImportMap(filePath) - cache.Put(filePath, testMap) - - b.ResetTimer() - b.ReportAllocs() - - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - _, _ = cache.Get(filePath) - } - }) -} diff --git a/sourcecode-parser/graph/callgraph/integration.go 
b/sourcecode-parser/graph/callgraph/integration.go index 1c776a0f..f311aa46 100644 --- a/sourcecode-parser/graph/callgraph/integration.go +++ b/sourcecode-parser/graph/callgraph/integration.go @@ -1,9 +1,9 @@ package callgraph import ( - "time" - "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/builder" + "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/patterns" ) // InitializeCallGraph builds the call graph from a code graph. @@ -25,32 +25,15 @@ import ( // - PatternRegistry: loaded security patterns // - error: if any step fails func InitializeCallGraph(codeGraph *graph.CodeGraph, projectRoot string) (*CallGraph, *ModuleRegistry, *PatternRegistry, error) { - // Pass 1: Build module registry - startRegistry := time.Now() - registry, err := BuildModuleRegistry(projectRoot) - if err != nil { - return nil, nil, nil, err - } - elapsedRegistry := time.Since(startRegistry) - - // Pass 2-3: Build call graph (includes import extraction and call site extraction) - startCallGraph := time.Now() - callGraph, err := BuildCallGraph(codeGraph, registry, projectRoot) + // Use builder package for call graph construction + callGraph, registry, err := builder.BuildCallGraphFromPath(codeGraph, projectRoot) if err != nil { return nil, nil, nil, err } - elapsedCallGraph := time.Since(startCallGraph) // Load security patterns - startPatterns := time.Now() - patternRegistry := NewPatternRegistry() + patternRegistry := patterns.NewPatternRegistry() patternRegistry.LoadDefaultPatterns() - elapsedPatterns := time.Since(startPatterns) - - // Log timing information - graph.Log("Module registry built in:", elapsedRegistry) - graph.Log("Call graph built in:", elapsedCallGraph) - graph.Log("Patterns loaded in:", elapsedPatterns) return callGraph, registry, patternRegistry, nil } diff --git a/sourcecode-parser/graph/callgraph/python_version_detector.go 
b/sourcecode-parser/graph/callgraph/python_version_detector.go index baf29587..2df05798 100644 --- a/sourcecode-parser/graph/callgraph/python_version_detector.go +++ b/sourcecode-parser/graph/callgraph/python_version_detector.go @@ -1,119 +1,12 @@ package callgraph import ( - "bufio" - "os" - "path/filepath" - "regexp" - "strings" + cgbuilder "github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/builder" ) // detectPythonVersion infers Python version from project files. -// It checks in order: -// 1. .python-version file -// 2. pyproject.toml [tool.poetry.dependencies] or [project] requires-python -// 3. Defaults to "3.14" // -// Parameters: -// - projectPath: absolute path to the project root -// -// Returns: -// - Python version string (e.g., "3.14", "3.11", "3.9") +// Deprecated: This function now delegates to builder.DetectPythonVersion. func detectPythonVersion(projectPath string) string { - // 1. Check .python-version file - if version := readPythonVersionFile(projectPath); version != "" { - return version - } - - // 2. Check pyproject.toml - if version := parsePyprojectToml(projectPath); version != "" { - return version - } - - // 3. Default to 3.14 - return "3.14" -} - -// readPythonVersionFile reads version from .python-version file. -// Format: "3.14.0" or "3.14" (we extract major.minor) -// -// Parameters: -// - projectPath: absolute path to the project root -// -// Returns: -// - Python version string (e.g., "3.14"), or empty string if not found -func readPythonVersionFile(projectPath string) string { - versionFile := filepath.Join(projectPath, ".python-version") - data, err := os.ReadFile(versionFile) - if err != nil { - return "" - } - - version := strings.TrimSpace(string(data)) - return extractMajorMinor(version) -} - -// parsePyprojectToml extracts Python version from pyproject.toml. 
-// Supports: -// - [project] requires-python = ">=3.11" -// - [tool.poetry.dependencies] python = "^3.11" -// -// Parameters: -// - projectPath: absolute path to the project root -// -// Returns: -// - Python version string (e.g., "3.11"), or empty string if not found -func parsePyprojectToml(projectPath string) string { - tomlFile := filepath.Join(projectPath, "pyproject.toml") - file, err := os.Open(tomlFile) - if err != nil { - return "" - } - defer file.Close() - - // Patterns to match: - // requires-python = ">=3.11" - // python = "^3.11" - // python = "~3.11" - requiresPythonRe := regexp.MustCompile(`requires-python\s*=\s*"[><=~^]*(\d+\.\d+)`) - poetryPythonRe := regexp.MustCompile(`python\s*=\s*"[\^~>=<]*(\d+\.\d+)`) - - scanner := bufio.NewScanner(file) - for scanner.Scan() { - line := scanner.Text() - - // Check requires-python pattern - if matches := requiresPythonRe.FindStringSubmatch(line); len(matches) > 1 { - return matches[1] - } - - // Check poetry python pattern - if matches := poetryPythonRe.FindStringSubmatch(line); len(matches) > 1 { - return matches[1] - } - } - - return "" -} - -// extractMajorMinor extracts major.minor version from full version string. -// Examples: -// - "3.14.0" -> "3.14" -// - "3.11" -> "3.11" -// - "3" -> "3" -// -// Parameters: -// - version: full version string -// -// Returns: -// - major.minor version string, or original if no dots found -func extractMajorMinor(version string) string { - parts := strings.Split(version, ".") - if len(parts) >= 2 { - return parts[0] + "." 
+ parts[1] - } - if len(parts) == 1 { - return parts[0] - } - return "" + return cgbuilder.DetectPythonVersion(projectPath) } diff --git a/sourcecode-parser/graph/callgraph/python_version_detector_test.go b/sourcecode-parser/graph/callgraph/python_version_detector_test.go deleted file mode 100644 index c448c211..00000000 --- a/sourcecode-parser/graph/callgraph/python_version_detector_test.go +++ /dev/null @@ -1,320 +0,0 @@ -package callgraph - -import ( - "os" - "path/filepath" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestDetectPythonVersion_PythonVersionFile(t *testing.T) { - // Create temporary directory - tmpDir := t.TempDir() - - // Write .python-version file - versionFile := filepath.Join(tmpDir, ".python-version") - err := os.WriteFile(versionFile, []byte("3.11.5\n"), 0644) - require.NoError(t, err) - - version := detectPythonVersion(tmpDir) - assert.Equal(t, "3.11", version) -} - -func TestDetectPythonVersion_PyprojectToml_RequiresPython(t *testing.T) { - tmpDir := t.TempDir() - - // Write pyproject.toml with requires-python - pyprojectContent := `[project] -name = "test-project" -requires-python = ">=3.10" -` - pyprojectFile := filepath.Join(tmpDir, "pyproject.toml") - err := os.WriteFile(pyprojectFile, []byte(pyprojectContent), 0644) - require.NoError(t, err) - - version := detectPythonVersion(tmpDir) - assert.Equal(t, "3.10", version) -} - -func TestDetectPythonVersion_PyprojectToml_Poetry(t *testing.T) { - tmpDir := t.TempDir() - - // Write pyproject.toml with poetry dependencies - pyprojectContent := `[tool.poetry] -name = "test-project" - -[tool.poetry.dependencies] -python = "^3.12" -` - pyprojectFile := filepath.Join(tmpDir, "pyproject.toml") - err := os.WriteFile(pyprojectFile, []byte(pyprojectContent), 0644) - require.NoError(t, err) - - version := detectPythonVersion(tmpDir) - assert.Equal(t, "3.12", version) -} - -func TestDetectPythonVersion_Default(t *testing.T) { - tmpDir := 
t.TempDir() - - // No version files - should default to 3.14 - version := detectPythonVersion(tmpDir) - assert.Equal(t, "3.14", version) -} - -func TestDetectPythonVersion_PriorityOrder(t *testing.T) { - tmpDir := t.TempDir() - - // Create both .python-version and pyproject.toml - // .python-version should take priority - versionFile := filepath.Join(tmpDir, ".python-version") - err := os.WriteFile(versionFile, []byte("3.9.0"), 0644) - require.NoError(t, err) - - pyprojectFile := filepath.Join(tmpDir, "pyproject.toml") - pyprojectContent := `[project] -requires-python = ">=3.11" -` - err = os.WriteFile(pyprojectFile, []byte(pyprojectContent), 0644) - require.NoError(t, err) - - version := detectPythonVersion(tmpDir) - assert.Equal(t, "3.9", version, ".python-version should take priority over pyproject.toml") -} - -func TestReadPythonVersionFile_Success(t *testing.T) { - tests := []struct { - name string - content string - expected string - }{ - { - name: "full version", - content: "3.14.0", - expected: "3.14", - }, - { - name: "major.minor only", - content: "3.11", - expected: "3.11", - }, - { - name: "with newline", - content: "3.12.1\n", - expected: "3.12", - }, - { - name: "with spaces", - content: " 3.10.5 ", - expected: "3.10", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - tmpDir := t.TempDir() - versionFile := filepath.Join(tmpDir, ".python-version") - err := os.WriteFile(versionFile, []byte(tt.content), 0644) - require.NoError(t, err) - - version := readPythonVersionFile(tmpDir) - assert.Equal(t, tt.expected, version) - }) - } -} - -func TestReadPythonVersionFile_NotFound(t *testing.T) { - tmpDir := t.TempDir() - version := readPythonVersionFile(tmpDir) - assert.Equal(t, "", version) -} - -func TestParsePyprojectToml_RequiresPython(t *testing.T) { - tests := []struct { - name string - content string - expected string - }{ - { - name: "requires-python >=", - content: `[project] -requires-python = ">=3.11" -`, - expected: 
"3.11", - }, - { - name: "requires-python ==", - content: `[project] -requires-python = "==3.10" -`, - expected: "3.10", - }, - { - name: "requires-python ~=", - content: `[project] -requires-python = "~=3.9" -`, - expected: "3.9", - }, - { - name: "requires-python with spaces", - content: `[project] -requires-python = ">=3.8" -`, - expected: "3.8", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - tmpDir := t.TempDir() - pyprojectFile := filepath.Join(tmpDir, "pyproject.toml") - err := os.WriteFile(pyprojectFile, []byte(tt.content), 0644) - require.NoError(t, err) - - version := parsePyprojectToml(tmpDir) - assert.Equal(t, tt.expected, version) - }) - } -} - -func TestParsePyprojectToml_Poetry(t *testing.T) { - tests := []struct { - name string - content string - expected string - }{ - { - name: "poetry ^", - content: `[tool.poetry.dependencies] -python = "^3.12" -`, - expected: "3.12", - }, - { - name: "poetry ~", - content: `[tool.poetry.dependencies] -python = "~3.11" -`, - expected: "3.11", - }, - { - name: "poetry >=", - content: `[tool.poetry.dependencies] -python = ">=3.10" -`, - expected: "3.10", - }, - { - name: "poetry with spaces", - content: `[tool.poetry.dependencies] -python = "^3.9" -`, - expected: "3.9", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - tmpDir := t.TempDir() - pyprojectFile := filepath.Join(tmpDir, "pyproject.toml") - err := os.WriteFile(pyprojectFile, []byte(tt.content), 0644) - require.NoError(t, err) - - version := parsePyprojectToml(tmpDir) - assert.Equal(t, tt.expected, version) - }) - } -} - -func TestParsePyprojectToml_NotFound(t *testing.T) { - tmpDir := t.TempDir() - version := parsePyprojectToml(tmpDir) - assert.Equal(t, "", version) -} - -func TestParsePyprojectToml_NoVersionInfo(t *testing.T) { - tmpDir := t.TempDir() - pyprojectFile := filepath.Join(tmpDir, "pyproject.toml") - content := `[project] -name = "test-project" -description = "A test project" -` - 
err := os.WriteFile(pyprojectFile, []byte(content), 0644) - require.NoError(t, err) - - version := parsePyprojectToml(tmpDir) - assert.Equal(t, "", version) -} - -func TestExtractMajorMinor(t *testing.T) { - tests := []struct { - name string - version string - expected string - }{ - { - name: "full version", - version: "3.14.0", - expected: "3.14", - }, - { - name: "major.minor only", - version: "3.11", - expected: "3.11", - }, - { - name: "major only", - version: "3", - expected: "3", - }, - { - name: "empty string", - version: "", - expected: "", - }, - { - name: "with patch and build", - version: "3.12.5.final.0", - expected: "3.12", - }, - { - name: "single digit", - version: "3.9", - expected: "3.9", - }, - { - name: "double digit minor", - version: "3.10.1", - expected: "3.10", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := extractMajorMinor(tt.version) - assert.Equal(t, tt.expected, result) - }) - } -} - -func TestParsePyprojectToml_ScannerEdgeCases(t *testing.T) { - tmpDir := t.TempDir() - - // Test with file that has matching line but scanner continues - pyprojectFile := filepath.Join(tmpDir, "pyproject.toml") - content := `[project] -name = "test" -# Some comment -requires-python = ">=3.8" -# More content after match -dependencies = ["requests"] -` - err := os.WriteFile(pyprojectFile, []byte(content), 0644) - require.NoError(t, err) - - version := parsePyprojectToml(tmpDir) - assert.Equal(t, "3.8", version) -}