Skip to content

Commit f647a50

Browse files
qiluo-msftFengPan-Frank
authored andcommitted
service_checker should consider k8s rollout containers (#25660)
Why I did it K8s rollout containers have long names, and we will use container label to identify its original container/service names such as "restapi". This PR will extend service_checker to respect either image native containers or k8s rollout containers for the same service. Work item tracking Microsoft ADO (number only): How I did it Check all containers labels. How to verify it Unit test. Manually verified on a sonic device with k8s rollout restapi container. Signed-off-by: Feng Pan <[email protected]>
1 parent fc95d63 commit f647a50

File tree

2 files changed

+152
-0
lines changed

2 files changed

+152
-0
lines changed

src/system-health/health_checker/service_checker.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,20 @@ def get_current_running_containers(self):
152152
lst = ctrs.list(filters={"status": "running"})
153153

154154
for ctr in lst:
155+
# Check if this is a Kubernetes-managed container
156+
labels = ctr.labels or {}
157+
ns = labels.get("io.kubernetes.pod.namespace")
158+
dtype = labels.get("io.kubernetes.docker.type")
159+
kname = labels.get("io.kubernetes.container.name")
160+
161+
if ns == "sonic":
162+
# Kubernetes-managed container - add service name to running containers
163+
# but skip critical process checking (k8s has its own health mechanisms)
164+
if dtype == "container" and kname and kname not in ("<no value>", "POD"):
165+
running_containers.add(kname)
166+
continue
167+
168+
# Regular Docker container - use the container name
155169
running_containers.add(ctr.name)
156170
if ctr.name not in self.container_critical_processes:
157171
self.fill_critical_process_by_container(ctr.name)

src/system-health/tests/test_system_health.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,144 @@ def test_service_checker_check_by_monit(mock_run):
354354
assert checker._info['diskCheck'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
355355

356356

357+
@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
358+
@patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path))
359+
@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False))
360+
@patch('docker.DockerClient')
361+
@patch('health_checker.utils.run_command')
362+
@patch('swsscommon.swsscommon.ConfigDBConnector')
363+
def test_service_checker_k8s_containers(mock_config_db, mock_run, mock_docker_client):
364+
"""Test that service checker recognizes Kubernetes-managed containers by labels"""
365+
setup()
366+
mock_db_data = MagicMock()
367+
mock_get_table = MagicMock()
368+
mock_db_data.get_table = mock_get_table
369+
mock_config_db.return_value = mock_db_data
370+
mock_get_table.return_value = {
371+
'snmp': {
372+
'state': 'enabled',
373+
'has_global_scope': 'True',
374+
'has_per_asic_scope': 'False',
375+
},
376+
'restapi': {
377+
'state': 'enabled',
378+
'has_global_scope': 'True',
379+
'has_per_asic_scope': 'False',
380+
}
381+
}
382+
383+
# Mock Kubernetes containers with labels
384+
mock_containers = MagicMock()
385+
mock_snmp_container = MagicMock()
386+
mock_snmp_container.name = 'k8s_snmp_snmp-pod-test_sonic_12345678-1234-1234-1234-123456789abc_0'
387+
mock_snmp_container.labels = {
388+
'io.kubernetes.pod.namespace': 'sonic',
389+
'io.kubernetes.docker.type': 'container',
390+
'io.kubernetes.container.name': 'snmp'
391+
}
392+
393+
mock_restapi_container = MagicMock()
394+
mock_restapi_container.name = 'k8s_restapi_restapi-pod-test_sonic_87654321-4321-4321-4321-cba987654321_0'
395+
mock_restapi_container.labels = {
396+
'io.kubernetes.pod.namespace': 'sonic',
397+
'io.kubernetes.docker.type': 'container',
398+
'io.kubernetes.container.name': 'restapi'
399+
}
400+
401+
# Mock POD container (should be ignored)
402+
mock_pod_container = MagicMock()
403+
mock_pod_container.name = 'k8s_POD_snmp-pod-test_sonic_12345678-1234-1234-1234-123456789abc_0'
404+
mock_pod_container.labels = {
405+
'io.kubernetes.pod.namespace': 'sonic',
406+
'io.kubernetes.docker.type': 'container',
407+
'io.kubernetes.container.name': 'POD'
408+
}
409+
410+
mock_containers.list = MagicMock(return_value=[mock_snmp_container, mock_restapi_container, mock_pod_container])
411+
mock_docker_client_object = MagicMock()
412+
mock_docker_client.return_value = mock_docker_client_object
413+
mock_docker_client_object.containers = mock_containers
414+
415+
mock_run.return_value = mock_supervisorctl_output
416+
417+
checker = ServiceChecker()
418+
config = Config()
419+
checker.check(config)
420+
421+
# Verify k8s containers are recognized by their label names
422+
running_containers = checker.get_current_running_containers()
423+
assert 'snmp' in running_containers
424+
assert 'restapi' in running_containers
425+
assert 'POD' not in running_containers
426+
427+
# Verify k8s containers are NOT added to critical processes (k8s has its own health checks)
428+
assert 'snmp' not in checker.container_critical_processes
429+
assert 'restapi' not in checker.container_critical_processes
430+
431+
432+
@patch('swsscommon.swsscommon.ConfigDBConnector.connect', MagicMock())
433+
@patch('health_checker.service_checker.ServiceChecker._get_container_folder', MagicMock(return_value=test_path))
434+
@patch('sonic_py_common.multi_asic.is_multi_asic', MagicMock(return_value=False))
435+
@patch('docker.DockerClient')
436+
@patch('health_checker.utils.run_command')
437+
@patch('swsscommon.swsscommon.ConfigDBConnector')
438+
def test_service_checker_mixed_containers(mock_config_db, mock_run, mock_docker_client):
439+
"""Test that service checker handles both regular Docker and Kubernetes containers"""
440+
setup()
441+
mock_db_data = MagicMock()
442+
mock_get_table = MagicMock()
443+
mock_db_data.get_table = mock_get_table
444+
mock_config_db.return_value = mock_db_data
445+
mock_get_table.return_value = {
446+
'swss': {
447+
'state': 'enabled',
448+
'has_global_scope': 'True',
449+
'has_per_asic_scope': 'False',
450+
},
451+
'database': {
452+
'state': 'enabled',
453+
'has_global_scope': 'True',
454+
'has_per_asic_scope': 'False',
455+
}
456+
}
457+
458+
mock_containers = MagicMock()
459+
460+
# Regular Docker container
461+
mock_swss_container = MagicMock()
462+
mock_swss_container.name = 'swss'
463+
mock_swss_container.labels = {}
464+
465+
# Kubernetes container
466+
mock_database_container = MagicMock()
467+
mock_database_container.name = 'k8s_database_database-pod-test_sonic_12345678_0'
468+
mock_database_container.labels = {
469+
'io.kubernetes.pod.namespace': 'sonic',
470+
'io.kubernetes.docker.type': 'container',
471+
'io.kubernetes.container.name': 'database'
472+
}
473+
474+
mock_containers.list = MagicMock(return_value=[mock_swss_container, mock_database_container])
475+
mock_docker_client_object = MagicMock()
476+
mock_docker_client.return_value = mock_docker_client_object
477+
mock_docker_client_object.containers = mock_containers
478+
479+
mock_run.return_value = mock_supervisorctl_output
480+
481+
checker = ServiceChecker()
482+
config = Config()
483+
checker.check(config)
484+
485+
# Verify both types of containers are recognized
486+
running_containers = checker.get_current_running_containers()
487+
assert 'swss' in running_containers
488+
assert 'database' in running_containers
489+
490+
# Verify only regular Docker containers are monitored for critical processes
491+
assert 'swss' in checker.container_critical_processes
492+
assert 'database' not in checker.container_critical_processes # k8s container, not monitored
493+
494+
357495
def test_hardware_checker():
358496
MockConnector.data.update({
359497
'TEMPERATURE_INFO|ASIC': {

0 commit comments

Comments
 (0)