Skip to content
Merged
2 changes: 1 addition & 1 deletion demo-output.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1352,9 +1352,9 @@
error-rule-001: |-
unable to get query info: yaml: unmarshal errors:
line 11: cannot unmarshal !!map into string
test-regex-pattern-00010: failed to perform file content search - could not run grep with provided pattern exit status 2
unmatched:
- file-002
- lang-ref-002
- node-sample-rule-003
- python-sample-rule-003
- test-regex-pattern-00010
30 changes: 30 additions & 0 deletions docs/providers.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,33 @@ The `builtin` provider is configured by default. To override the default config,
The `builtin` provider takes the following additional configuration options in `providerSpecificConfig`:

* `tagsFile`: Path to YAML file that contains a list of tags for the application being analyzed

* `excludedDirs`: List of directory paths or patterns to exclude from analysis. These can be absolute paths or relative paths (relative to the `location`). Directory names are also matched anywhere in the tree (e.g., `"node_modules"` will exclude all `node_modules` directories).

The following directories are excluded by default to prevent performance issues and "argument list too long" errors:
- `node_modules` - JavaScript/TypeScript dependencies
- `vendor` - PHP/Go dependencies
- `.git` - Git repository data
- `dist` - Build output
- `build` - Build output
- `target` - Java/Rust build output
- `.venv`, `venv` - Python virtual environments

Additional directories can be excluded by specifying them in the config:

```json
{
"name": "builtin",
"initConfig": [
{
"location": "/path/to/application",
"providerSpecificConfig": {
"excludedDirs": [
"custom-build-dir",
"/absolute/path/to/exclude"
]
}
}
]
}
```
38 changes: 31 additions & 7 deletions provider/internal/builtin/service_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -534,17 +534,41 @@ func (b *builtinServiceClient) performFileContentSearch(pattern string, location
cmd.Stdin = &fileList
currOutput, err = cmd.Output()
default:
args := []string{"-o", "-n", "--with-filename", "-R", "-P", pattern}
b.log.V(7).Info("running grep with args", "args", args)
args = append(args, locations...)
cmd := exec.Command("grep", args...)
// Use xargs to avoid ARG_MAX limits when processing large numbers of files
// This prevents "argument list too long" errors when analyzing projects
// with many files (e.g., node_modules with 30,000+ files)
var fileList bytes.Buffer
for _, f := range currBatch {
fileList.WriteString(f)
fileList.WriteByte('\x00')
}
// Escape pattern for safe shell interpolation
escapedPattern := strings.ReplaceAll(pattern, "'", "'\"'\"'")
cmdStr := fmt.Sprintf(
`xargs -0 grep -o -n --with-filename -P '%s'`,
escapedPattern,
)
b.log.V(7).Info("running grep via xargs", "cmd", cmdStr)
cmd := exec.Command("/bin/sh", "-c", cmdStr)
cmd.Stdin = &fileList
currOutput, err = cmd.Output()
}
if err != nil {
if exitError, ok := err.(*exec.ExitError); ok && exitError.ExitCode() == 1 {
return nil, nil
if exitError, ok := err.(*exec.ExitError); ok {
// Exit code 1: grep found no matches
// Exit code 123: GNU xargs (Linux) exits with 123 when any invocation exits with 1-125
// When grep processes files across multiple xargs batches and some batches have matches
// while others don't, xargs will exit with 123 (not 1). Treating that as a fatal error
// would discard the partial results in currOutput and cause false negatives,
// so both exit codes are treated as non-fatal here:
if exitError.ExitCode() == 1 || exitError.ExitCode() == 123 {
err = nil // Clear error; treat as "no matches in this batch"
// Continue to next batch (don't return!)
}
}
if err != nil {
return nil, fmt.Errorf("could not run grep with provided pattern %+v", err)
}
return nil, fmt.Errorf("could not run grep with provided pattern %+v", err)
}
outputBytes.Write(currOutput)
}
Expand Down
18 changes: 17 additions & 1 deletion provider/lib.go
Original file line number Diff line number Diff line change
Expand Up @@ -403,8 +403,24 @@ func GetIncludedPathsFromConfig(i InitConfig, allowFilePaths bool) []string {
return validatedPaths
}

// GetExcludedDirsFromConfig returns directories to exclude from analysis.
// It starts with sensible defaults (node_modules, vendor, .git, dist, build, target, .venv, venv)
// to prevent "argument list too long" errors when analyzing projects with large
// dependency directories. User-configured excludes are appended to these defaults.
func GetExcludedDirsFromConfig(i InitConfig) []string {
validatedPaths := []string{}
// Default excluded directories prevent issues with large dependency dirs
validatedPaths := []string{
"node_modules", // JavaScript/TypeScript dependencies
"vendor", // PHP/Go dependencies
".git", // Git repository data
"dist", // Common build output directory
"build", // Common build output directory
"target", // Java/Rust build output
".venv", // Python virtual environment
"venv", // Python virtual environment
}

// Add user-configured excludes
if excludedDirs, ok := i.ProviderSpecificConfig[ExcludedDirsConfigKey].([]interface{}); ok {
for _, dir := range excludedDirs {
if expath, ok := dir.(string); ok {
Expand Down
Loading