Skip to content

Commit 4344049

Browse files
authored
[NDMII-3603] Dynamic batch size for SNMP check (#41689)
### What does this PR do? This PR implements a dynamic batch size for the SNMP check. It works as follows: At start, the batch size is the configured batch size (or the default if not configured). For a fetch in an SNMP check: - If the fetch fails: - The batch size is divided by a [`decreaseFactor`](https://github.com/DataDog/datadog-agent/pull/41689/files#diff-7e1c5e2d2bee2b336f1316cf0bba6c2f50d17461d9ff6b8997d4c8ef32ed202aR16), and we retry the fetch with the new batch size in the same check. If it still fails, we decrease the batch size again, and so on until it reaches 1. - If the fetch succeeds: - The batch size is increased by an [`increaseValue`](https://github.com/DataDog/datadog-agent/pull/41689/files#diff-7e1c5e2d2bee2b336f1316cf0bba6c2f50d17461d9ff6b8997d4c8ef32ed202aR16) for the next check. The increased batch size cannot be more than the configured batch size. We also keep a map that associates each batch size with its number of fetch failures. This map is used so that if a batch size has failed too many times ([`maxFailuresPerWindow`](https://github.com/DataDog/datadog-agent/pull/41689/files#diff-7e1c5e2d2bee2b336f1316cf0bba6c2f50d17461d9ff6b8997d4c8ef32ed202aR19)) during a certain time window ([failuresWindowDuration](https://github.com/DataDog/datadog-agent/pull/41689/files#diff-7e1c5e2d2bee2b336f1316cf0bba6c2f50d17461d9ff6b8997d4c8ef32ed202aR18)), we do not increase to this batch size and keep the current one instead. I think this is useful to avoid always retrying batch sizes that will always fail for some devices, while still retrying them after the time window in case some devices lacked capacity only temporarily. I also chose to keep a separate batch size for each SNMP operation (Get, GetBulk, GetNext), because I think it's possible that a certain batch size will work for one operation but not for another (I do not have proof of that). Let me know if you think keeping a single batch size for all operations would be better.
### Motivation ### Describe how you validated your changes - [Unit tests](https://github.com/DataDog/datadog-agent/pull/41689/files#diff-1e048e3bd44ca70b6119fbcd02b2890d0eeb568c7ec448e557e227c289ddb933R1004) with multiple fetch iterations. - Did a QA with a Python script that set up an SNMPSim device with a custom max batch size for each SNMP operation, and it worked as expected. ### Additional Notes
1 parent 48e764c commit 4344049

10 files changed

Lines changed: 1178 additions & 46 deletions

File tree

pkg/collector/corechecks/snmp/internal/devicecheck/devicecheck.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ type DeviceCheck struct {
142142
sender *report.MetricSender
143143
session session.Session
144144
sessionFactory session.Factory
145+
oidBatchSizeOptimizers *fetch.OidBatchSizeOptimizers
145146
devicePinger pinger.Pinger
146147
sessionCloseErrorCount *atomic.Uint64
147148
savedDynamicTags []string
@@ -173,6 +174,7 @@ func NewDeviceCheck(config *checkconfig.CheckConfig, ipAddress string, sessionFa
173174
d := DeviceCheck{
174175
config: newConfig,
175176
sessionFactory: sessionFactory,
177+
oidBatchSizeOptimizers: fetch.NewOidBatchSizeOptimizers(newConfig.OidBatchSize),
176178
devicePinger: devicePinger,
177179
sessionCloseErrorCount: atomic.NewUint64(0),
178180
diagnoses: diagnoses.NewDeviceDiagnoses(newConfig.DeviceID),
@@ -390,8 +392,8 @@ func (d *DeviceCheck) getValuesAndTags() (bool, profiledefinition.ProfileDefinit
390392

391393
tags = append(tags, profile.StaticTags...)
392394

393-
valuesStore, err := fetch.Fetch(d.session, d.profileCache.scalarOIDs, d.profileCache.columnOIDs, d.config.OidBatchSize,
394-
d.config.BulkMaxRepetitions)
395+
valuesStore, err := fetch.Fetch(d.session, d.profileCache.scalarOIDs, d.profileCache.columnOIDs,
396+
d.oidBatchSizeOptimizers, d.config.BulkMaxRepetitions)
395397
if log.ShouldLog(log.DebugLvl) {
396398
log.Debugf("fetched values: %v", valuestore.ResultValueStoreAsString(valuesStore))
397399
}
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
// Unless explicitly stated otherwise all files in this repository are licensed
2+
// under the Apache License Version 2.0.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/).
4+
// Copyright 2025-present Datadog, Inc.
5+
6+
package fetch
7+
8+
import (
9+
"time"
10+
11+
"github.com/DataDog/datadog-agent/pkg/util/log"
12+
)
13+
14+
// Tuning parameters for the dynamic SNMP OID batch size.
const (
	// onSuccessIncreaseValue is added to the batch size after a successful fetch.
	onSuccessIncreaseValue = 1
	// onFailureDecreaseFactor is the divisor applied to the batch size after a
	// failed fetch.
	onFailureDecreaseFactor = 2

	// failuresWindowDuration is the minimum time that must elapse between two
	// resets of the per-batch-size failure counters.
	failuresWindowDuration = time.Hour
	// maxFailuresPerWindow is the failure count at which a batch size is no
	// longer selected when growing the batch size.
	maxFailuresPerWindow = 2
)
21+
22+
// OidBatchSizeOptimizers holds oidBatchSizeOptimizer for each SNMP request operation
23+
type OidBatchSizeOptimizers struct {
24+
snmpGetOptimizer *oidBatchSizeOptimizer
25+
snmpGetBulkOptimizer *oidBatchSizeOptimizer
26+
snmpGetNextOptimizer *oidBatchSizeOptimizer
27+
lastRefreshTs time.Time
28+
}
29+
30+
// oidBatchSizeOptimizer holds data between check runs to be able to find an optimized batch size for SNMP requests
31+
type oidBatchSizeOptimizer struct {
32+
snmpOperation snmpOperation
33+
configBatchSize int
34+
batchSize int
35+
failuresByBatchSize map[int]int
36+
lastSuccessfulBatchSize int
37+
}
38+
39+
// NewOidBatchSizeOptimizers creates a OidBatchSizeOptimizers
40+
func NewOidBatchSizeOptimizers(configBatchSize int) *OidBatchSizeOptimizers {
41+
now := time.Now()
42+
43+
return &OidBatchSizeOptimizers{
44+
snmpGetOptimizer: newOidBatchSizeOptimizer(snmpGet, configBatchSize),
45+
snmpGetBulkOptimizer: newOidBatchSizeOptimizer(snmpGetBulk, configBatchSize),
46+
snmpGetNextOptimizer: newOidBatchSizeOptimizer(snmpGetNext, configBatchSize),
47+
lastRefreshTs: now,
48+
}
49+
}
50+
51+
// Refresh refreshes each oidBatchSizeOptimizer in OidBatchSizeOptimizers when outdated
52+
func (o *OidBatchSizeOptimizers) refreshIfOutdated(now time.Time) {
53+
if now.Sub(o.lastRefreshTs) < failuresWindowDuration {
54+
return
55+
}
56+
57+
o.snmpGetOptimizer.refreshFailures()
58+
o.snmpGetBulkOptimizer.refreshFailures()
59+
o.snmpGetNextOptimizer.refreshFailures()
60+
61+
o.lastRefreshTs = now
62+
63+
log.Debug("SNMP batch size optimizers have been refreshed")
64+
}
65+
66+
// newOidBatchSizeOptimizer creates a oidBatchSizeOptimizer
67+
func newOidBatchSizeOptimizer(snmpOperation snmpOperation, configBatchSize int) *oidBatchSizeOptimizer {
68+
return &oidBatchSizeOptimizer{
69+
snmpOperation: snmpOperation,
70+
configBatchSize: configBatchSize,
71+
batchSize: configBatchSize,
72+
failuresByBatchSize: make(map[int]int),
73+
}
74+
}
75+
76+
// refreshFailures refreshes the failures count for each batch size in oidBatchSizeOptimizer
77+
func (o *oidBatchSizeOptimizer) refreshFailures() {
78+
o.failuresByBatchSize = make(map[int]int)
79+
}
80+
81+
// onBatchSizeFailure decreases the batch size and returns whether the batch size changed
82+
func (o *oidBatchSizeOptimizer) onBatchSizeFailure() bool {
83+
o.failuresByBatchSize[o.batchSize]++
84+
85+
oldBatchSize := o.batchSize
86+
87+
newBatchSize := max(o.batchSize/onFailureDecreaseFactor, 1)
88+
if oldBatchSize > o.lastSuccessfulBatchSize && newBatchSize < o.lastSuccessfulBatchSize {
89+
newBatchSize = o.lastSuccessfulBatchSize
90+
}
91+
92+
o.batchSize = newBatchSize
93+
94+
log.Debugf("SNMP fetch using %s with batch size %d failed, new batch size is %d",
95+
o.snmpOperation, oldBatchSize, newBatchSize)
96+
97+
return oldBatchSize != newBatchSize
98+
}
99+
100+
// onBatchSizeSuccess increases the batch size
101+
func (o *oidBatchSizeOptimizer) onBatchSizeSuccess() {
102+
o.lastSuccessfulBatchSize = o.batchSize
103+
104+
if o.batchSize >= o.maxBatchSize() {
105+
return
106+
}
107+
108+
newBatchSize := min(o.batchSize+onSuccessIncreaseValue, o.maxBatchSize())
109+
if o.failuresByBatchSize[newBatchSize] >= maxFailuresPerWindow {
110+
return
111+
}
112+
113+
log.Debugf("SNMP fetch using %s with batch size %d success, new batch size is %d",
114+
o.snmpOperation, o.lastSuccessfulBatchSize, newBatchSize)
115+
116+
o.batchSize = newBatchSize
117+
}
118+
119+
// maxBatchSize returns the max batch size
120+
func (o *oidBatchSizeOptimizer) maxBatchSize() int {
121+
return o.configBatchSize
122+
}

0 commit comments

Comments
 (0)