Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,10 @@ import (
"google.golang.org/grpc"
)

const hostAttribute = "evergreen.host"
const (
hostAttribute = "evergreen.host"
ps = "ps"
)

var (
shouldExitAttribute = fmt.Sprintf("%s.should_exit", hostAttribute)
Expand Down Expand Up @@ -769,14 +772,19 @@ func (a *Agent) runPreAndMain(ctx context.Context, tc *taskContext) (status stri
tc.setHeartbeatTimeout(heartbeatTimeoutOptions{})
}()

// set up the system stats collector
// Set up the system stats collector.
statsCmds := []string{"uptime", "df -h"}

// Add ps command if configured in YAML or expansion (for backward compatibility) when default ps logging is not disabled.
if psCmd := tc.getPSCommand(); psCmd != "" {
statsCmds = append(statsCmds, psCmd)
}

statsCollector := NewSimpleStatsCollector(
tc.logger,
a.jasper,
globals.DefaultStatsInterval,
"uptime",
"df -h",
"${ps|ps}",
statsCmds...,
)
// Running the `df` command on Unix systems displays inode
// statistics without the `-i` flag by default. However, we need
Expand Down
1 change: 1 addition & 0 deletions agent/internal/task_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ type TaskConfig struct {
TaskGroup *model.TaskGroup
CommandCleanups []CommandCleanup
MaxExecTimeoutSecs int
PSLoggingDisabled bool

// PatchOrVersionDescription holds the description of a patch or
// message of a version to be used in the otel attributes.
Expand Down
3 changes: 3 additions & 0 deletions agent/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ func (sc *StatsCollector) expandCommands(exp util.Expansions) {
sc.logger.System().Warning(errors.Wrapf(err, "expanding stats command '%s'", cmd))
continue
}
if strings.TrimSpace(expanded) == "" {
continue
}
expandedCmds = append(expandedCmds, expanded)
}
sc.Cmds = expandedCmds
Expand Down
47 changes: 47 additions & 0 deletions agent/task_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,52 @@ func (tc *taskContext) getExecTimeout() time.Duration {
return globals.DefaultExecTimeout
}

// getPSCommand returns the ps command to run for the current task, consulting
// the task configuration in the following priority order:
// 1. Build variant task-level PS
// 2. Project task-level PS
// 3. Project-level PS
// 4. Expansion fallback (only when PSLoggingDisabled=false for backward compatibility)
// The first source that is set wins. The build variant task-level and project
// task-level values are pointers, so a value that is set but empty returns ""
// without consulting the lower-priority sources; the project-level value is a
// plain string, so an empty one falls through to the fallback.
// Values from the first three sources are expanded to support users specifying
// expansions in YAML (e.g., ps: "${my_ps}"); the fallback "ps" expansion value is
// returned as-is. An empty result means there is no ps command to run.
// The task context's read lock is held for the duration of the call.
func (tc *taskContext) getPSCommand() string {
tc.RLock()
defer tc.RUnlock()

// Check build variant task-level PS (highest priority).
bvTask := tc.taskConfig.Project.FindBuildVariantTaskUnit(
tc.taskConfig.Task.BuildVariant,
tc.taskConfig.Task.DisplayName,
)
// A nil PS means "not set at this level"; a non-nil PS is honored even when empty.
if bvTask != nil && bvTask.PS != nil {
// NOTE(review): the expansion error is discarded here and below; confirm what
// ExpandString returns on failure and whether the error should be logged.
ps, _ := tc.taskConfig.Expansions.ExpandString(*bvTask.PS)
return ps
}

// Check project task-level PS (second priority).
projectTask := tc.taskConfig.Project.FindProjectTask(tc.taskConfig.Task.DisplayName)
if projectTask != nil && projectTask.PS != nil {
ps, _ := tc.taskConfig.Expansions.ExpandString(*projectTask.PS)
return ps
}

// Check project-level PS (third priority). Unlike the task-level fields, this is
// a plain string, so an empty value is indistinguishable from "not set".
if tc.taskConfig.Project.PS != "" {
ps, _ := tc.taskConfig.Expansions.ExpandString(tc.taskConfig.Project.PS)
return ps
}

// For backward compatibility: when PSLoggingDisabled=false, fall back to ps expansion.
// This allows distro/build variant expansions to work for existing projects.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wait sorry just to clarify my understanding -- when we do disable ps logging, does that mean we won't be accepting distro-level expansions anymore?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct, once it's disabled, there will be no more automatic ps logging, including not accepting distro-level expansions. Users will have to set ps to ${ps | ps} to get what the default is now.

if !tc.taskConfig.PSLoggingDisabled {
// The "ps" expansion value is used verbatim (it is not passed through ExpandString).
if psExpansion := tc.taskConfig.Expansions.Get("ps"); psExpansion != "" {
return psExpansion
}
// Default to "ps" when PSLoggingDisabled is false and no expansion is set.
return "ps"
}

// Default ps logging is disabled and nothing was configured explicitly.
return ""
}

// makeTaskConfig fetches task configuration data required to run the task from the API server.
func (a *Agent) makeTaskConfig(ctx context.Context, tc *taskContext) (*internal.TaskConfig, error) {
if tc.taskConfig != nil {
Expand Down Expand Up @@ -407,6 +453,7 @@ func (a *Agent) makeTaskConfig(ctx context.Context, tc *taskContext) (*internal.
}
taskConfig.TaskOutput = a.opts.SetupData.TaskOutput
taskConfig.MaxExecTimeoutSecs = a.opts.SetupData.MaxExecTimeoutSecs
taskConfig.PSLoggingDisabled = a.opts.SetupData.PSLoggingDisabled

// Set AWS credentials for task output buckets.
awsCreds := pail.CreateAWSStaticCredentials(taskConfig.TaskOutput.Key, taskConfig.TaskOutput.Secret, "")
Expand Down
1 change: 1 addition & 0 deletions apimodels/agent_models.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ type AgentSetupData struct {
TaskOutput evergreen.S3Credentials `json:"task_output"`
TraceCollectorEndpoint string `json:"trace_collector_endpoint"`
MaxExecTimeoutSecs int `json:"max_exec_timeout_secs"`
PSLoggingDisabled bool `json:"ps_logging_disabled"`
}

// NextTaskResponse represents the response sent back when an agent asks for a next task
Expand Down
2 changes: 1 addition & 1 deletion config.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ var (

// Agent version to control agent rollover. The format is the calendar date
// (YYYY-MM-DD).
AgentVersion = "2026-02-06a"
AgentVersion = "2026-02-07"
)

const (
Expand Down
1 change: 1 addition & 0 deletions config_db.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ var (
s3LifecycleSyncDisabledKey = bsonutil.MustHaveTag(ServiceFlags{}, "S3LifecycleSyncDisabled")
useGitForGitHubFilesDisabledKey = bsonutil.MustHaveTag(ServiceFlags{}, "UseGitForGitHubFilesDisabled")
useMergeQueuePathFilteringDisabledKey = bsonutil.MustHaveTag(ServiceFlags{}, "UseMergeQueuePathFilteringDisabled")
psLoggingDisabledKey = bsonutil.MustHaveTag(ServiceFlags{}, "PSLoggingDisabled")

// ContainerPoolsConfig keys
poolsKey = bsonutil.MustHaveTag(ContainerPoolsConfig{}, "Pools")
Expand Down
2 changes: 2 additions & 0 deletions config_serviceflags.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ type ServiceFlags struct {
S3LifecycleSyncDisabled bool `bson:"s3_lifecycle_sync_disabled" json:"s3_lifecycle_sync_disabled"`
UseGitForGitHubFilesDisabled bool `bson:"use_git_for_github_files_disabled" json:"use_git_for_github_files_disabled"`
UseMergeQueuePathFilteringDisabled bool `bson:"use_merge_queue_path_filtering_disabled" json:"use_merge_queue_path_filtering_disabled"`
PSLoggingDisabled bool `bson:"ps_logging_disabled" json:"ps_logging_disabled"`

// Notification Flags
EventProcessingDisabled bool `bson:"event_processing_disabled" json:"event_processing_disabled"`
Expand Down Expand Up @@ -102,6 +103,7 @@ func (c *ServiceFlags) Set(ctx context.Context) error {
s3LifecycleSyncDisabledKey: c.S3LifecycleSyncDisabled,
useGitForGitHubFilesDisabledKey: c.UseGitForGitHubFilesDisabled,
useMergeQueuePathFilteringDisabledKey: c.UseMergeQueuePathFilteringDisabled,
psLoggingDisabledKey: c.PSLoggingDisabled,
}}), "updating config section '%s'", c.SectionId(),
)
}
Expand Down
126 changes: 113 additions & 13 deletions docs/Project-Configuration/Project-Configuration-Files.md
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ Fields:
non-module related build variant fields ([context](../decisions/2024-07-18_allow_module_expansions)).
- `tasks`: a list of tasks to run, referenced either by task name or by tags.
Tasks listed here can also include other task-level fields, such as
`batchtime`, `cron`, `activate`, `depends_on`, `stepback`, and `run_on`.
`batchtime`, `cron`, `activate`, `depends_on`, `stepback`, `run_on`, and `ps`.
We can also [define when a task will run](#controlling-when-tasks-and-variants-run). If there are
conflicting settings definitions at different levels, the order of priority
is defined [here](#task-fields-override-hierarchy).
Expand Down Expand Up @@ -557,13 +557,13 @@ Parameters:
**Exec timeout: exec_timeout_secs**
You can customize the points at which the "timeout" conditions are
triggered. To cause a task to stop (and fail) if it doesn't complete
within an allotted time, set the key `exec_timeout_secs` on the overall project
or on a specific task to set the maximum allowed length of execution time. Exec timeout only
within an allotted time, set the key `exec_timeout_secs` on the overall project,
on a specific task, or on a specific task within a build variant to set the maximum allowed length of execution time. Exec timeout only
applies to commands that run in `pre`, `setup_group`, `setup_task`, and the main
task commands; it does not apply to the `post`, `teardown_task`, and
`teardown_group` blocks. This timeout defaults to 6 hours, and cannot be set above 24 hours.
`exec_timeout_secs` can only be set on the project or on a task as seen in below example.
It cannot be set on functions or build variant tasks.
`exec_timeout_secs` can be set on the project, on a task, or on a task within a build variant, as seen in the example below.
It cannot be set on functions.

You can also set `exec_timeout_secs` using [timeout.update](Project-Commands#timeoutupdate).

Expand Down Expand Up @@ -594,16 +594,24 @@ buildvariants:
- localtestdistro
tasks:
- name: compile
- name: test
exec_timeout_secs: 30 ## override the project and task level exec_timeout_secs for this variant's test task

tasks:
name: compile
commands:
- command: shell.exec
timeout_secs: 10 ## override the project level timeout_secs defined above and force this command to fail if it stays "idle" for 10 seconds or more
exec_timeout_secs: 20 ## will override the project level exec_timeout_secs defined above for this task
params:
script: |
sleep 1000
- name: compile
commands:
- command: shell.exec
timeout_secs: 10 ## override the project level timeout_secs defined above and force this command to fail if it stays "idle" for 10 seconds or more
params:
script: |
sleep 1000
- name: test
exec_timeout_secs: 20 ## will override the project level exec_timeout_secs defined above for this task
commands:
- command: shell.exec
params:
script: |
echo "running tests"
```

### Controlling When Tasks and Variants Run
Expand Down Expand Up @@ -1156,6 +1164,98 @@ To disable the OOM tracker, add the following to the top-level of your yaml.
oom_tracker: false
```

### Process Diagnostics: ps

By default, Evergreen logs process information every 60 seconds during task execution, using the `ps` expansion if one is defined (in distro settings, project variables, or the project YAML) and plain `ps` otherwise. This default behavior will be deprecated soon: process logging will become opt-in, meaning it is disabled unless explicitly configured and no longer falls back to the `ps` expansion or `ps`.

You can customize the process logging command by setting the `ps` field at multiple configuration levels. There is currently no option to opt out, but once default ps logging is deprecated, you will be able to disable process logging either by not setting `ps` anywhere (the default) or by setting `ps` to an empty string. When enabled, the specified command runs every 60 seconds.

The `ps` field follows a priority order (from highest to lowest):

1. **Build variant task level** - Overrides all other settings
2. **Project task level** - Overrides project-level and lower settings
3. **Project level** - Overrides build variant expansions and default
4. **Default** - Currently defaults to the ps expansion or `"ps"` (will be deprecated to no logging)

**Note about distro and build variant expansions:** When the default ps logging behavior is deprecated (in the future), distro, project variable, and build variant `ps` expansions will be ignored unless you have an explicit `ps` setting at the project, task, or build variant task level. To use an expansion, you will need to reference it explicitly with `ps: "${my_custom_ps}"` at the desired level (as outlined above).

**Project level** (overrides build variant expansions and default):

```yaml
ps: "ps -o pid" # Enable for all tasks

tasks:
- name: my_task
commands:
- command: shell.exec
params:
script: echo "Running with ps logging"
```

**Task level** (overrides project level, build variant expansions, and default):

```yaml
tasks:
- name: task_with_custom_ps
ps: "ps -o pid,tty,time,comm,args" # Custom ps command for this task
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry if i missed where we did this, but can we add these to the list of available task and variant fields in the docs as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't have a consolidated list of task fields (I can create a ticket to add it if you'd like).

The Build Variants Fields section is for fields that can be set directly on the build variant itself (like name, display_name, run_on, etc). This can only be set on the BuildVariantTaskUnit, so I instead added a mention to the tasks field in that list.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah gotcha. What's the logic for not having this set at the variant level, out of curiosity, just not needed, or bc we want to reduce how often we're doing overrides?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

love the big new example also!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to reduce how often we're doing overrides, we can add it later if there is interest.

commands:
- command: shell.exec
params:
script: echo "Running with custom ps"

- name: task_without_ps
ps: "" # Explicitly disable ps logging
commands:
- command: shell.exec
params:
script: echo "No ps logging"
```

**Build variant task level** (highest priority, overrides task level, project level, build variant expansions, and default):

```yaml
ps: "ps -o pid" # Project-level

tasks:
- name: my_task_1
ps: "ps -o pid,tty,time,comm,args" # Task-level
commands:
- command: shell.exec
params:
script: echo "Task execution"

- name: my_task_2
ps: "ps -o pid,tty,time,comm,args" # Task-level
commands:
- command: shell.exec
params:
script: echo "Custom ps task"

- name: other_task
commands:
- command: shell.exec
params:
script: echo "Task without explicit ps"

- name: task_with_expansion
ps: "${my_custom_ps}" # Reference expansion defined in build variant
commands:
- command: shell.exec
params:
script: echo "Task using expansion"

buildvariants:
- name: ubuntu2204
expansions:
my_custom_ps: "ps -o pid,user,comm" # Define custom ps command as expansion
tasks:
- name: my_task_1
ps: "ps -o pid,tty,time" # Build variant task-level: overrides task and project level ps.
- name: my_task_2 # Uses task-level "ps -o pid,tty,time,comm,args" since there is no build variant task-level override.
- name: other_task # Uses project-level "ps -o pid" since no task-level or build variant task-level ps is set.
- name: task_with_expansion # Uses "ps -o pid,user,comm" from the my_custom_ps expansion, which is defined on this build variant and referenced at task level.
```

### Matrix Variant Definition

The matrix syntax is deprecated in favor of the
Expand Down
Loading
Loading