Skip to content

Commit baa8699

Browse files
lipxumssonicbld
authored and committed
[memory utilization] update memory threshold (#19167)
What is the motivation for this PR? There are too many memory-above-threshold alarms in the nightly tests. How did you do it? Update the FRR memory thresholds and make the alarm message more readable. memory_increase_threshold: FRR has its own memory management system and does not return memory to the system immediately, so the threshold is increased. 1: top:zebra: update from 64 to 128M 2: frr_bgp: update from 32 to 64M 3: frr_zebra: update from 16 to 64M. memory_high_threshold: FRR bgp memory usage is related to the number of neighbors, so the threshold is increased. In the future, the threshold should be set according to the number of neighbors. 1: frr_bgp: update from 128 to 256M. How did you verify/test it? Run nightly test https://elastictest.org/scheduler/testplan/685ac58d2461750d1f5a11c9
1 parent 643d808 commit baa8699

3 files changed

Lines changed: 111 additions & 42 deletions

File tree

tests/common/plugins/memory_utilization/README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,30 @@ The memory utilization plugin automatically monitors memory resources and genera
3535

3636
## Plugin Design
3737

38+
### Memory Utilization Plugin Summary
39+
40+
The memory utilization plugin for SONiC testing automatically monitors memory usage on the Device Under Test (DUT) before and after each test. Its main goals are to ensure that memory usage does not exceed configured thresholds and to detect memory leaks or abnormal increases during test execution.
41+
42+
**Key Features:**
43+
- **Automatic Monitoring:** Runs for all tests unless explicitly disabled.
44+
- **Configurable Thresholds:** Uses JSON files to define memory checks, commands, and thresholds (absolute values or percentages).
45+
- **Multiple Monitors:** Supports system memory, process memory, docker containers, and FRR daemons.
46+
- **Flexible Scope:** Allows global, HWSKU-specific, and test-specific configuration.
47+
- **Failure Reporting:** Fails tests with detailed messages if thresholds are exceeded.
48+
49+
**How It Works:**
50+
1. **Pre-test:** Collects baseline memory usage using configured commands and parsers.
51+
2. **Post-test:** Collects memory usage again and compares with baseline.
52+
3. **Validation:** Checks if usage exceeds high thresholds or if increase is above allowed limits.
53+
4. **Reporting:** Fails the test if any check fails, with clear diagnostics.
54+
55+
**Configuration:**
56+
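- Thresholds can be absolute values, percentages, or both. When both are given in a list, the strictest (smallest) applies by default; adding a `{"type": "comparison", "value": "max"}` entry to the list selects the largest instead.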
57+
- Can be disabled globally or per-test.
58+
- Easily extendable for new memory monitors or custom thresholds.
59+
60+
This plugin helps maintain system stability and quickly identifies memory-related issues during SONiC test runs.
61+
3862
### Configuration Files
3963

4064
The plugin uses two JSON configuration files:

tests/common/plugins/memory_utilization/memory_utilization.py

Lines changed: 81 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -49,18 +49,16 @@ def check_memory_thresholds(self, current_values, previous_values):
4949
logger.debug("Current values: {}".format(current_values))
5050

5151
for name, cmd, memory_params, memory_check_fn in self.commands:
52-
logger.debug("Checking thresholds for command: {}".format(name))
53-
5452
for mem_item, thresholds in memory_params.items():
55-
logger.debug("Processing memory item: {}".format(mem_item))
53+
logger.info("Checking thresholds for command: {}-{}".format(name, mem_item))
5654

5755
# Convert thresholds to structured format for consistency
5856
logger.debug("Original thresholds: {}".format(thresholds))
5957
normalized_thresholds = self._normalize_thresholds(thresholds)
6058
logger.debug("Normalized thresholds: {}".format(normalized_thresholds))
6159

62-
current_value = float(current_values.get(name, {}).get(mem_item, 0))
63-
previous_value = float(previous_values.get(name, {}).get(mem_item, 0))
60+
current_value = round(float(current_values.get(name, {}).get(mem_item, 0)), 1)
61+
previous_value = round(float(previous_values.get(name, {}).get(mem_item, 0)), 1)
6462

6563
if current_value == 0 or previous_value == 0:
6664
logger.warning("Skipping memory check for {}-{} due to zero value".format(name, mem_item))
@@ -102,7 +100,17 @@ def check_memory_thresholds(self, current_values, previous_values):
102100
)
103101

104102
def _normalize_thresholds(self, thresholds):
105-
"""Normalize threshold values to structured format."""
103+
"""
104+
Convert legacy or shorthand threshold formats into a consistent structured format.
105+
106+
Purpose:
107+
- Ensures all threshold values are represented as dictionaries with explicit "type" and "value" fields.
108+
- Converts simple numeric values to {"type": "value", "value": ...}
109+
- Converts percentage strings (like "10%") to {"type": "percentage", "value": ...}
110+
- Leaves already-structured or list values unchanged.
111+
112+
This normalization allows the rest of the code to process thresholds in a uniform way.
113+
"""
106114
normalized = {}
107115

108116
for key, value in thresholds.items():
@@ -127,9 +135,9 @@ def _parse_threshold(self, threshold, base_value):
127135
"""
128136
Parse threshold value which can be either:
129137
1. A dict with type and value fields
130-
2. A list of dicts for multiple threshold types
131-
132-
Returns the most restrictive calculated threshold value.
138+
2. A list of dicts for multiple threshold types,
139+
possibly including a {"type": "comparison", "value": "min"/"max"}
140+
Returns the selected calculated threshold value.
133141
"""
134142
logger.debug("Parsing threshold: {} (type: {}) with base value: {}".format(
135143
threshold, type(threshold).__name__, base_value))
@@ -152,7 +160,7 @@ def _parse_threshold(self, threshold, base_value):
152160
if percentage < 0 or percentage > 100:
153161
logger.warning("Percentage threshold outside normal range (0-100): {}%".format(percentage))
154162

155-
calculated = (percentage / 100.0) * base_value
163+
calculated = round((percentage / 100.0) * base_value, 1)
156164
logger.debug("Calculated percentage threshold: {}% of {} = {}".format(
157165
percentage, base_value, calculated))
158166
return calculated
@@ -161,7 +169,7 @@ def _parse_threshold(self, threshold, base_value):
161169
return float('inf')
162170
elif threshold_type == 'value':
163171
try:
164-
value = float(threshold_value)
172+
value = round(float(threshold_value), 1)
165173
if value < 0:
166174
logger.warning("Negative threshold value: {}".format(value))
167175
logger.debug("Using absolute value: {}".format(value))
@@ -178,19 +186,27 @@ def _parse_threshold(self, threshold, base_value):
178186
elif isinstance(threshold, list):
179187
logger.debug("Processing a list of {} thresholds".format(len(threshold)))
180188
thresholds = []
181-
for i, t in enumerate(threshold):
182-
if isinstance(t, dict) and 'type' in t and 'value' in t:
189+
comparison = None
190+
for t in threshold:
191+
if isinstance(t, dict) and t.get("type") == "comparison":
192+
# Accepts "min" or "max"
193+
comparison = t.get("value", None)
194+
elif isinstance(t, dict) and 'type' in t and 'value' in t:
183195
parsed = self._parse_threshold(t, base_value)
184-
logger.debug("List item {}: parsed value = {}".format(i, parsed))
196+
logger.debug("List item: parsed value = {}".format(parsed))
185197
thresholds.append(parsed)
186198
else:
187199
logger.warning("Skipping invalid threshold list item: {}".format(t))
188-
189-
# Return the most restrictive (smallest) threshold
190-
min_value = min(thresholds) if thresholds else float('inf')
191-
logger.debug("Selected minimum threshold from list: {} (from values: {})".format(
192-
min_value, thresholds))
193-
return min_value
200+
if not thresholds:
201+
return float('inf')
202+
if comparison == "max":
203+
selected = round(max(thresholds), 1)
204+
else:
205+
# Default to min if no comparison specified
206+
selected = round(min(thresholds), 1)
207+
logger.info("Selected {} threshold from list: {} (from values: {})".format(
208+
comparison if comparison else "min", selected, thresholds))
209+
return selected
194210

195211
# Handle deprecated formats with warning
196212
elif isinstance(threshold, (int, float, str)):
@@ -201,7 +217,7 @@ def _parse_threshold(self, threshold, base_value):
201217
if isinstance(threshold, str) and threshold.endswith('%'):
202218
try:
203219
percentage = float(threshold.rstrip('%'))
204-
calculated = (percentage / 100.0) * base_value
220+
calculated = round((percentage / 100.0) * base_value, 1)
205221
logger.debug("Calculated legacy percentage threshold: {}% of {} = {}".format(
206222
percentage, base_value, calculated))
207223
return calculated
@@ -211,7 +227,7 @@ def _parse_threshold(self, threshold, base_value):
211227
else:
212228
# Simple value
213229
try:
214-
value = float(threshold)
230+
value = round(float(threshold), 1)
215231
logger.debug("Using legacy absolute threshold: {}".format(value))
216232
return value
217233
except (ValueError, TypeError) as e:
@@ -227,24 +243,49 @@ def _handle_memory_threshold_exceeded(self, name, mem_item, value, threshold,
227243
logger.info("{}:{}, previous_values: {}".format(name, mem_item, previous_values))
228244
logger.info("{}:{}, current_values: {}".format(name, mem_item, current_values))
229245

230-
# Format threshold for display in a more readable format
246+
# Enhanced formatting for value and threshold
247+
def fmt(val, unit="MB"):
248+
if isinstance(val, float) or isinstance(val, int):
249+
return f"{val:.1f} {unit}"
250+
return str(val)
251+
252+
# Determine threshold type and format accordingly
253+
def format_threshold_and_value(threshold, value):
254+
if isinstance(threshold, dict) and 'type' in threshold:
255+
if threshold['type'] == 'percentage':
256+
return f"{value:.1f}%", f"{threshold['value']}"
257+
elif threshold['type'] == 'value':
258+
return fmt(value), fmt(float(threshold['value']))
259+
elif isinstance(threshold, list):
260+
# Find the first threshold dict with type
261+
for t in threshold:
262+
if isinstance(t, dict) and 'type' in t:
263+
return format_threshold_and_value(t, value)
264+
return str(value), str(threshold)
265+
else:
266+
# fallback
267+
return str(value), str(threshold)
268+
231269
threshold_str = self._format_threshold_for_display(threshold)
232270
logger.debug("Threshold exceeded - measured value: {}, formatted threshold: {}".format(
233271
value, threshold_str))
234272

273+
prev_val = previous_values.get(name, {}).get(mem_item, 0)
274+
curr_val = current_values.get(name, {}).get(mem_item, 0)
275+
276+
# Format for increase or high threshold
235277
if is_increase:
278+
val_str, th_str = format_threshold_and_value(threshold, value)
236279
message = (
237-
"[ALARM]: {}:{} memory usage increased by {}, "
238-
"exceeds increase threshold {}".format(
239-
name, mem_item, value, threshold_str
240-
)
280+
"[ALARM]: {}:{} memory usage increased by {}, exceeds increase threshold {} (previous: {}, current: {})"
281+
.format(name, mem_item, val_str, th_str, fmt(prev_val), fmt(curr_val))
241282
)
242283
else:
284+
which = "Current" if is_current else "Previous"
285+
val_str, th_str = format_threshold_and_value(threshold, value)
243286
message = (
244-
"[ALARM]: {}:{}, {} memory usage {} exceeds "
245-
"high threshold {}".format(
246-
name, mem_item, "Current" if is_current else "Previous", value, threshold_str
247-
)
287+
"[ALARM]: {}:{}, {} memory usage {} exceeds high threshold {} (previous: {}, current: {})"
288+
.format(name, mem_item, which, val_str, th_str, fmt(prev_val), fmt(curr_val))
248289
)
249290

250291
# Not return failure on Virtual Switch
@@ -385,9 +426,11 @@ def parse_top_output(output, memory_params):
385426
for mem_item, thresholds in memory_params.items():
386427
if mem_item in process_info["COMMAND"]:
387428
if mem_item in memory_values:
388-
memory_values[mem_item] += float(int(process_info["RES"]) / 1024)
429+
memory_values[mem_item] = round(
430+
memory_values[mem_item] + float(int(process_info["RES"]) / 1024), 1
431+
)
389432
else:
390-
memory_values[mem_item] = float(int(process_info["RES"]) / 1024)
433+
memory_values[mem_item] = round(float(int(process_info["RES"]) / 1024), 1)
391434

392435
logger.debug("Parsed memory values: {}".format(memory_values))
393436
return memory_values
@@ -414,7 +457,7 @@ def parse_free_output(output, memory_params):
414457
swap_info = {headers[i]: int(Swap[i]) for i in range(len(Swap))}
415458

416459
for mem_item, _ in memory_params.items():
417-
memory_values[mem_item] = mem_info.get(mem_item, 0) + swap_info.get(mem_item, 0)
460+
memory_values[mem_item] = round(mem_info.get(mem_item, 0) + swap_info.get(mem_item, 0), 1)
418461

419462
logger.debug("Parsed memory values: {}".format(memory_values))
420463
return memory_values
@@ -437,7 +480,7 @@ def parse_monit_status_output(output, memory_params):
437480
if match:
438481
used_memory = match.group(1) # noqa: F841
439482
memory_percentage = match.group(2)
440-
memory_values['memory_usage'] = float(memory_percentage)
483+
memory_values['memory_usage'] = round(float(memory_percentage), 1)
441484
else:
442485
logger.error("Failed to parse memory usage from line: {}".format(line))
443486
if "swap usage" in line:
@@ -475,7 +518,7 @@ def parse_docker_stats_output(output, memory_params):
475518
match = re.search(pattern, line)
476519
if match:
477520
mem_usage = match.group(2)
478-
memory_values[mem_item] = mem_usage
521+
memory_values[mem_item] = round(float(mem_usage), 1)
479522
else:
480523
logger.error("Failed to parse memory usage from line: {}".format(line))
481524
else:
@@ -511,10 +554,10 @@ def parse_frr_memory_output(output, memory_params):
511554
if unit in unit_multipliers:
512555
memory_bytes = value * unit_multipliers[unit]
513556
# Convert to MB for consistent measurement
514-
memory_values['used'] = memory_bytes / (1024 * 1024)
557+
memory_values['used'] = round(memory_bytes / (1024 * 1024), 1)
515558
else:
516559
logger.warning("Unknown memory unit: {}, treating as bytes".format(unit))
517-
memory_values['used'] = value / (1024 * 1024)
560+
memory_values['used'] = round(value / (1024 * 1024), 1)
518561

519562
except (ValueError, TypeError) as e:
520563
logger.error("Failed to parse FRR memory value: {}".format(e))

tests/common/plugins/memory_utilization/memory_utilization_dependence.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"zebra": {
1818
"memory_increase_threshold": {
1919
"type": "value",
20-
"value": 64
20+
"value": 128
2121
},
2222
"memory_high_threshold": null
2323
}
@@ -169,11 +169,12 @@
169169
"used": {
170170
"memory_increase_threshold": [
171171
{"type": "percentage", "value": "50%"},
172-
{"type": "value", "value": 32}
172+
{"type": "value", "value": 64},
173+
{"type": "comparison", "value": "max"}
173174
],
174175
"memory_high_threshold": {
175176
"type": "value",
176-
"value": 128
177+
"value": 256
177178
}
178179
}
179180
},
@@ -186,7 +187,8 @@
186187
"used": {
187188
"memory_increase_threshold": [
188189
{"type": "percentage", "value": "50%"},
189-
{"type": "value", "value": 16}
190+
{"type": "value", "value": 64},
191+
{"type": "comparison", "value": "max"}
190192
],
191193
"memory_high_threshold": {
192194
"type": "value",

0 commit comments

Comments
 (0)