Skip to content

Commit 0ccaeb2

Browse files
shivasurya and claude committed
refactor(callgraph): Create analysis/taint, extraction packages and complete registry
This PR completes the registry layer with stdlib support and creates the foundation for taint analysis by establishing the analysis/taint and extraction packages.

Package Structure Created:
- analysis/taint: Intra-procedural taint analysis (15 tests, 94.8% coverage)
- extraction: AST statement extraction (46 tests, 89.1% coverage)
- registry: Enhanced with stdlib loader and remote CDN support

Files Migrated:
- taint.go → analysis/taint/analyzer.go
- statement_extraction.go → extraction/statements.go
- stdlib_registry_loader.go → registry/stdlib_loader.go
- stdlib_registry_remote.go → registry/stdlib_remote.go

Type Aliases: All original files updated with backward-compatible type aliases to maintain zero breaking changes.

Bug Fix: Enhanced matchesFunctionName in analysis/taint/analyzer.go to handle parentheses and support suffix/prefix matching, fixing test failures in intra-procedural taint detection.

Verification: gradle clean buildGo (SUCCESS), gradle testGo (all pass), gradle lintGo (0 issues), coverage 89.1-94.8% (exceeds 85% target).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 64f8c7a commit 0ccaeb2

14 files changed

Lines changed: 1427 additions & 1259 deletions
Lines changed: 355 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,355 @@
1+
package taint
2+
3+
import (
4+
"strings"
5+
6+
"github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/core"
7+
)
8+
9+
// variableTaintInfo is the internal per-variable taint record stored in a
// TaintState.
type variableTaintInfo struct {
	// Source is the source function that introduced the taint.
	Source string
	// Confidence is the confidence level: 1.0 for direct taint, below 1.0
	// once the taint has been propagated.
	Confidence float64
	// SourceLine is the line where the taint was introduced.
	SourceLine uint32
}
15+
16+
// TaintState tracks taint information for all variables in a function.
17+
type TaintState struct {
18+
Variables map[string]*variableTaintInfo
19+
}
20+
21+
// NewTaintState creates an empty taint state.
22+
func NewTaintState() *TaintState {
23+
return &TaintState{
24+
Variables: make(map[string]*variableTaintInfo),
25+
}
26+
}
27+
28+
// SetTainted marks a variable as tainted.
29+
func (ts *TaintState) SetTainted(varName, source string, confidence float64, sourceLine uint32) {
30+
ts.Variables[varName] = &variableTaintInfo{
31+
Source: source,
32+
Confidence: confidence,
33+
SourceLine: sourceLine,
34+
}
35+
}
36+
37+
// SetUntainted marks a variable as untainted (sanitized).
38+
func (ts *TaintState) SetUntainted(varName string) {
39+
delete(ts.Variables, varName)
40+
}
41+
42+
// GetTaintInfo returns taint information for a variable.
43+
// Returns nil if variable has no taint information.
44+
func (ts *TaintState) GetTaintInfo(varName string) *variableTaintInfo {
45+
return ts.Variables[varName]
46+
}
47+
48+
// IsTainted returns true if the variable is tainted.
49+
func (ts *TaintState) IsTainted(varName string) bool {
50+
return ts.Variables[varName] != nil
51+
}
52+
53+
// AnalyzeIntraProceduralTaint performs forward taint analysis on a function.
54+
// Returns a TaintSummary with detections of taint flows.
55+
func AnalyzeIntraProceduralTaint(
56+
functionFQN string,
57+
statements []*core.Statement,
58+
defUseChain *core.DefUseChain,
59+
sources []string,
60+
sinks []string,
61+
sanitizers []string,
62+
) *core.TaintSummary {
63+
taintState := NewTaintState()
64+
summary := core.NewTaintSummary(functionFQN)
65+
66+
// Forward data flow analysis
67+
for _, stmt := range statements {
68+
// Check if this is a SOURCE
69+
if isSource(stmt, sources) {
70+
// Mark LHS as tainted
71+
if stmt.Def != "" {
72+
taintState.SetTainted(stmt.Def, stmt.CallTarget, 1.0, stmt.LineNumber)
73+
74+
// Add to TaintedVars
75+
summary.AddTaintedVar(stmt.Def, &core.TaintInfo{
76+
SourceLine: stmt.LineNumber,
77+
SourceVar: stmt.Def,
78+
Confidence: 1.0,
79+
})
80+
}
81+
continue
82+
}
83+
84+
// Check if this is a SANITIZER
85+
if isSanitizer(stmt, sanitizers) {
86+
handleSanitizer(stmt, taintState)
87+
continue
88+
}
89+
90+
// Handle ASSIGNMENT propagation
91+
if stmt.Type == core.StatementTypeAssignment {
92+
propagateAssignment(stmt, taintState, summary)
93+
}
94+
95+
// Handle CALL propagation
96+
if stmt.Type == core.StatementTypeCall || stmt.CallTarget != "" {
97+
propagateCall(stmt, taintState, summary)
98+
}
99+
100+
// Check if this is a SINK
101+
if isSink(stmt, sinks) {
102+
// Check if any argument is tainted
103+
for _, usedVar := range stmt.Uses {
104+
if taintInfo := taintState.GetTaintInfo(usedVar); taintInfo != nil {
105+
// Create detection
106+
detection := &core.TaintInfo{
107+
SourceLine: taintInfo.SourceLine,
108+
SourceVar: usedVar,
109+
SinkLine: stmt.LineNumber,
110+
SinkCall: stmt.CallTarget,
111+
Confidence: taintInfo.Confidence,
112+
}
113+
summary.AddDetection(detection)
114+
}
115+
}
116+
}
117+
}
118+
119+
return summary
120+
}
121+
122+
// propagateAssignment propagates taint through assignments: y = x.
123+
func propagateAssignment(stmt *core.Statement, taintState *TaintState, summary *core.TaintSummary) {
124+
if stmt.Def == "" {
125+
return
126+
}
127+
128+
// Check if any variable in RHS (Uses) is tainted
129+
for _, usedVar := range stmt.Uses {
130+
if taintInfo := taintState.GetTaintInfo(usedVar); taintInfo != nil {
131+
// Propagate taint from RHS to LHS (no decay for simple assignment)
132+
taintState.SetTainted(stmt.Def, taintInfo.Source, taintInfo.Confidence, taintInfo.SourceLine)
133+
134+
// Add to summary
135+
summary.AddTaintedVar(stmt.Def, &core.TaintInfo{
136+
SourceLine: taintInfo.SourceLine,
137+
SourceVar: stmt.Def,
138+
Confidence: taintInfo.Confidence,
139+
})
140+
return
141+
}
142+
}
143+
}
144+
145+
// propagateCall propagates taint through function calls: y = func(x).
146+
func propagateCall(stmt *core.Statement, taintState *TaintState, summary *core.TaintSummary) {
147+
if stmt.Def == "" {
148+
return
149+
}
150+
151+
// Check if call is a non-propagator (len, type, etc.)
152+
if isNonPropagator(stmt.CallTarget) {
153+
return
154+
}
155+
156+
// Check if any argument is tainted
157+
var taintedArg *variableTaintInfo
158+
for _, usedVar := range stmt.Uses {
159+
if info := taintState.GetTaintInfo(usedVar); info != nil {
160+
taintedArg = info
161+
break
162+
}
163+
}
164+
165+
if taintedArg == nil {
166+
return
167+
}
168+
169+
// Determine confidence decay based on call type
170+
decay := 0.7 // Default: conservative propagation for stdlib/third-party
171+
172+
// Propagate with decay
173+
newConfidence := taintedArg.Confidence * decay
174+
taintState.SetTainted(stmt.Def, taintedArg.Source, newConfidence, taintedArg.SourceLine)
175+
176+
// Add to summary
177+
summary.AddTaintedVar(stmt.Def, &core.TaintInfo{
178+
SourceLine: taintedArg.SourceLine,
179+
SourceVar: stmt.Def,
180+
Confidence: newConfidence,
181+
})
182+
}
183+
184+
// handleSanitizer handles sanitizer calls (removes taint).
185+
func handleSanitizer(stmt *core.Statement, taintState *TaintState) {
186+
if stmt.Def != "" {
187+
taintState.SetUntainted(stmt.Def)
188+
}
189+
}
190+
191+
// isSource checks if statement is a taint source.
192+
func isSource(stmt *core.Statement, sources []string) bool {
193+
if stmt.CallTarget == "" {
194+
return false
195+
}
196+
197+
for _, source := range sources {
198+
if matchesFunctionName(stmt.CallTarget, source) {
199+
return true
200+
}
201+
}
202+
203+
// Check hardcoded stdlib sources
204+
return isStdlibSource(stmt.CallTarget)
205+
}
206+
207+
// isSink checks if statement is a taint sink.
208+
func isSink(stmt *core.Statement, sinks []string) bool {
209+
if stmt.CallTarget == "" {
210+
return false
211+
}
212+
213+
for _, sink := range sinks {
214+
if matchesFunctionName(stmt.CallTarget, sink) {
215+
return true
216+
}
217+
}
218+
219+
return false
220+
}
221+
222+
// isSanitizer checks if statement is a sanitizer.
223+
func isSanitizer(stmt *core.Statement, sanitizers []string) bool {
224+
if stmt.CallTarget == "" {
225+
return false
226+
}
227+
228+
for _, sanitizer := range sanitizers {
229+
if matchesFunctionName(stmt.CallTarget, sanitizer) {
230+
return true
231+
}
232+
}
233+
234+
// Check hardcoded stdlib sanitizers
235+
return isStdlibSanitizer(stmt.CallTarget)
236+
}
237+
238+
// Hardcoded stdlib tables (Tier 2). Each table maps a module name to the
// function names of that module that belong to the category.
var (
	// stdlibSources lists the hardcoded taint sources.
	stdlibSources = map[string][]string{
		"os":     {"getenv", "environ"},
		"sys":    {"argv"},
		"socket": {"recv", "recvfrom", "recvmsg"},
	}

	// stdlibSanitizers lists the hardcoded sanitizers.
	stdlibSanitizers = map[string][]string{
		"html":         {"escape"},
		"urllib.parse": {"quote", "quote_plus"},
		"shlex":        {"quote"},
	}

	// stdlibNonPropagators lists the hardcoded functions that do not
	// propagate taint.
	stdlibNonPropagators = map[string][]string{
		"builtins": {"len", "type", "isinstance", "hasattr", "id", "bool", "int", "str", "float"},
		"os.path":  {"exists", "isfile", "isdir", "getsize", "isabs"},
	}
)
257+
258+
// isStdlibSource checks if call is a known stdlib source.
259+
func isStdlibSource(callTarget string) bool {
260+
module, funcName := splitModuleFunction(callTarget)
261+
if sources, ok := stdlibSources[module]; ok {
262+
for _, s := range sources {
263+
if s == funcName {
264+
return true
265+
}
266+
}
267+
}
268+
return false
269+
}
270+
271+
// isStdlibSanitizer checks if call is a known stdlib sanitizer.
272+
func isStdlibSanitizer(callTarget string) bool {
273+
module, funcName := splitModuleFunction(callTarget)
274+
if sanitizers, ok := stdlibSanitizers[module]; ok {
275+
for _, s := range sanitizers {
276+
if s == funcName {
277+
return true
278+
}
279+
}
280+
}
281+
return false
282+
}
283+
284+
// isNonPropagator checks if function doesn't propagate taint.
285+
func isNonPropagator(callTarget string) bool {
286+
module, funcName := splitModuleFunction(callTarget)
287+
288+
// Check exact module.function match
289+
if funcs, ok := stdlibNonPropagators[module]; ok {
290+
for _, f := range funcs {
291+
if f == funcName {
292+
return true
293+
}
294+
}
295+
}
296+
297+
// Check builtins (no module prefix)
298+
if module == "" {
299+
if funcs, ok := stdlibNonPropagators["builtins"]; ok {
300+
for _, f := range funcs {
301+
if f == callTarget {
302+
return true
303+
}
304+
}
305+
}
306+
}
307+
308+
return false
309+
}
310+
311+
// splitModuleFunction splits a call target at its last dot, e.g.
// "os.path.join" becomes ("os.path", "join"). A target without a dot is
// returned as ("", target), i.e. a builtin with no module.
//
// Call targets may carry an argument list (e.g. "os.getenv()"); everything
// from the first "(" on is dropped before splitting, so the function name
// never includes parentheses and dots inside the arguments are not mistaken
// for the module separator.
func splitModuleFunction(callTarget string) (module, function string) {
	name := callTarget
	if paren := strings.Index(name, "("); paren >= 0 {
		name = name[:paren]
	}

	lastDot := strings.LastIndex(name, ".")
	if lastDot == -1 {
		return "", name // No module (builtin)
	}
	return name[:lastDot], name[lastDot+1:]
}
319+
320+
// matchesFunctionName reports whether callTarget matches the function name
// pattern.
//
// Everything from the first "(" in callTarget on is ignored, so "input()"
// matches "input". The remaining name matches when it is
//   - equal to pattern ("eval" matches "eval"),
//   - pattern preceded by a dotted qualifier ("builtins.eval" matches "eval"),
//   - or pattern followed by further dotted components ("request.GET.get"
//     matches "request.GET").
//
// An empty pattern never matches; without this guard it would match every
// target that starts or ends with a dot, or that is empty once the argument
// list is removed.
func matchesFunctionName(callTarget, pattern string) bool {
	if pattern == "" {
		return false
	}

	// Strip the argument list from the call target if present.
	cleanTarget := callTarget
	if idx := strings.Index(callTarget, "("); idx >= 0 {
		cleanTarget = callTarget[:idx]
	}

	switch {
	case cleanTarget == pattern:
		// Exact match.
		return true
	case strings.HasSuffix(cleanTarget, "."+pattern):
		// Suffix match on a component boundary; this also covers the case
		// where only the last component equals the pattern.
		return true
	case strings.HasPrefix(cleanTarget, pattern+"."):
		// Prefix match on a component boundary.
		return true
	}
	return false
}

0 commit comments

Comments
 (0)