diff --git a/device/pensando/arm64-elba-asic-flash128-r0/pre_reboot_hook b/device/pensando/arm64-elba-asic-flash128-r0/pre_reboot_hook index 574e42a1a0..6a1903c2bf 100755 --- a/device/pensando/arm64-elba-asic-flash128-r0/pre_reboot_hook +++ b/device/pensando/arm64-elba-asic-flash128-r0/pre_reboot_hook @@ -21,6 +21,12 @@ LOG_MSG "Dropping caches..." LOG_MSG 3 > /proc/sys/vm/drop_caches LOG_MSG "Cache cleared." +sleep 2 LOG_MSG "Pre-reboot flush and cache clear complete." + +sleep 2 +LOG_MSG "Writing pre-shutdown success to CPLD reg" +docker exec "$(cat /host/dpu-docker-info/name)" /nic/bin/cpldapp -w 0xd 200 + LOG_MSG "Platform specific pre-shutdown is successful" diff --git a/device/pensando/arm64-elba-asic-flash128-r0/system_health_monitoring_config.json b/device/pensando/arm64-elba-asic-flash128-r0/system_health_monitoring_config.json index b4440ac504..f292822b85 100644 --- a/device/pensando/arm64-elba-asic-flash128-r0/system_health_monitoring_config.json +++ b/device/pensando/arm64-elba-asic-flash128-r0/system_health_monitoring_config.json @@ -2,7 +2,7 @@ "services_to_ignore": ["vlanmgrd", "vxlanmgrd"], "devices_to_ignore": ["psu","fan"], "user_defined_checkers": ["fetch_dpu_status"], - "polling_interval": 60, + "polling_interval": 20, "led_color": { "fault": "amber", "normal": "green", diff --git a/files/dsc/dpu.init b/files/dsc/dpu.init index 5f424f7a5d..e636bf77b1 100755 --- a/files/dsc/dpu.init +++ b/files/dsc/dpu.init @@ -41,7 +41,8 @@ function start_polaris() mkdir -p $HOST_DIR_POLARIS/obfl mkdir -p $HOST_DIR_POLARIS/data mkdir -p $HOST_DIR_POLARIS/share - mount -t tmpfs -o size=20M sharetmps $HOST_DIR_POLARIS/share + mount -t tmpfs -o size=20M tmpfs $HOST_DIR_POLARIS/share + mkdir -p $HOST_DIR_POLARIS/external && mount -t tmpfs -o size=5M tmpfs $HOST_DIR_POLARIS/external mkdir -p $HOST_DIR_POLARIS/dsc mount -t tmpfs -o size=5M dsctmps $HOST_DIR_POLARIS/dsc mkdir -p $HOST_DIR_POLARIS/mnt @@ -61,7 +62,7 @@ function start_polaris() docker ps -a --format "{{.ID}}\t{{.Image}}" | grep 
"$IMAGE_NAME:$TAG" | awk '{print $1}' | xargs -I {} docker rm {} - docker run -v $HOST_DIR_POLARIS/update:/update -v $HOST_DIR_POLARIS/sysconfig/config0:/sysconfig/config0 -v $HOST_DIR_POLARIS/sysconfig/config1:/sysconfig/config1 -v $HOST_DIR_POLARIS/obfl/a:/obfl -v $HOST_DIR_POLARIS/obfl:/var/log/obfl -v $HOST_DIR_POLARIS/data:/data -v $HOST_DIR_POLARIS/share:/share -v $HOST_DIR_POLARIS/share:/external -v $HOST_DIR_POLARIS/mnt/a:/ro -v /dev:/dev -v /sys:/sys --net=host --name=$CONTAINER_NAME_POLARIS --privileged $IMAGE_NAME:$TAG & + docker run -v $HOST_DIR_POLARIS/update:/update -v $HOST_DIR_POLARIS/sysconfig/config0:/sysconfig/config0 -v $HOST_DIR_POLARIS/sysconfig/config1:/sysconfig/config1 -v $HOST_DIR_POLARIS/obfl/a:/obfl -v $HOST_DIR_POLARIS/obfl:/var/log/obfl -v $HOST_DIR_POLARIS/data:/data -v $HOST_DIR_POLARIS/share:/share -v $HOST_DIR_POLARIS/external:/external -v $HOST_DIR_POLARIS/mnt/a:/ro -v /dev:/dev -v /sys:/sys --net=host --name=$CONTAINER_NAME_POLARIS --privileged $IMAGE_NAME:$TAG & if [ -f /boot/first_boot ]; then device="/usr/share/sonic/device" @@ -74,6 +75,9 @@ function start_polaris() sleep 5 INTERFACE="eth0-midplane" if ip link show "$INTERFACE" &> /dev/null; then + echo "dhclient -r $INTERFACE" + /usr/sbin/dhclient -r $INTERFACE + sleep 1 echo "dhclient $INTERFACE" /usr/sbin/dhclient $INTERFACE fi diff --git a/files/image_config/midplane-network/midplane-network-dpu.network b/files/image_config/midplane-network/midplane-network-dpu.network index bb4c58ed9a..2f9f3759bd 100644 --- a/files/image_config/midplane-network/midplane-network-dpu.network +++ b/files/image_config/midplane-network/midplane-network-dpu.network @@ -3,3 +3,6 @@ Name=eth0-midplane [Network] DHCP=yes + +[DHCPv4] +ClientIdentifier=mac diff --git a/platform/checkout/pensando.ini b/platform/checkout/pensando.ini index 8faffbfc22..eea4792edc 100644 --- a/platform/checkout/pensando.ini +++ b/platform/checkout/pensando.ini @@ -1,4 +1,4 @@ [module] 
repo=git@github.com:pensando/pensando-sonic-artifacts.git -ref=1.87.0-SS-15-release +ref=1.95.0-SS-17-release path=platform/pensando/pensando-sonic-artifacts diff --git a/platform/pensando/one-image.mk b/platform/pensando/one-image.mk index 8055c719ed..3741df0766 100644 --- a/platform/pensando/one-image.mk +++ b/platform/pensando/one-image.mk @@ -9,10 +9,21 @@ $(SONIC_ONE_IMAGE)_INSTALLS += $(DPU_MODULE) $(SONIC_ONE_IMAGE)_INSTALLS += $(IONIC_MODULE) $(SONIC_ONE_IMAGE)_INSTALLS += $(PENSANDO_DPU_PLATFORM_MODULE) + +DISABLED_DOCKERS = $(DOCKER_SFLOW) $(DOCKER_MGMT_FRAMEWORK) $(DOCKER_NAT) $(DOCKER_TEAMD) $(DOCKER_ROUTER_ADVERTISER) $(DOCKER_MUX) $(DOCKER_SNMP) +DISABLED_PACKAGES_LOCAL = $(DOCKER_DHCP_RELAY) $(DOCKER_MACSEC) +DISABLED_FEATURE_FLAGS = INCLUDE_SFLOW INCLUDE_MGMT_FRAMEWORK INCLUDE_NAT INCLUDE_MACSEC INCLUDE_TEAMD INCLUDE_ROUTER_ADVERTISER INCLUDE_MUX +$(info Disabling the following docker images: $(DISABLED_DOCKERS)) +$(info Disabling the following packages: $(DISABLED_PACKAGES_LOCAL)) +$(info Disabling the following feature flags: $(DISABLED_FEATURE_FLAGS)) + +SONIC_PACKAGES_LOCAL := $(filter-out $(DISABLED_PACKAGES_LOCAL), $(SONIC_PACKAGES_LOCAL)) +$(foreach feature, $(DISABLED_FEATURE_FLAGS), $(eval override $(feature)=n)) + ifeq ($(INSTALL_DEBUG_TOOLS),y) $(SONIC_ONE_IMAGE)_DOCKERS += $(SONIC_INSTALL_DOCKER_DBG_IMAGES) -$(SONIC_ONE_IMAGE)_DOCKERS += $(filter-out $(patsubst %-$(DBG_IMAGE_MARK).gz,%.gz, $(SONIC_INSTALL_DOCKER_DBG_IMAGES)), $(SONIC_INSTALL_DOCKER_IMAGES)) +$(SONIC_ONE_IMAGE)_DOCKERS += $(filter-out $(patsubst %-$(DBG_IMAGE_MARK).gz,%.gz, $(SONIC_INSTALL_DOCKER_DBG_IMAGES)), $(filter-out $(DISABLED_DOCKERS), $(SONIC_INSTALL_DOCKER_IMAGES))) else -$(SONIC_ONE_IMAGE)_DOCKERS = $(SONIC_INSTALL_DOCKER_IMAGES) +$(SONIC_ONE_IMAGE)_DOCKERS = $(filter-out $(DISABLED_DOCKERS), $(SONIC_INSTALL_DOCKER_IMAGES)) endif SONIC_INSTALLERS += $(SONIC_ONE_IMAGE) diff --git a/platform/pensando/sonic-platform-modules-dpu/dpu/service/dpu-db-util.service 
b/platform/pensando/sonic-platform-modules-dpu/dpu/service/dpu-db-util.service index e59b1f9a53..b44c5713f1 100644 --- a/platform/pensando/sonic-platform-modules-dpu/dpu/service/dpu-db-util.service +++ b/platform/pensando/sonic-platform-modules-dpu/dpu/service/dpu-db-util.service @@ -5,8 +5,8 @@ After=dpu-platform-init.service [Service] ExecStart=/usr/bin/python3 /usr/local/bin/dpu_db_util.py Restart=always -StandardOutput=syslog+console -StandardError=syslog+console +StandardOutput=journal+console +StandardError=journal+console [Install] WantedBy=multi-user.target diff --git a/platform/pensando/sonic-platform-modules-dpu/dpu/utils/dpu_db_util.py b/platform/pensando/sonic-platform-modules-dpu/dpu/utils/dpu_db_util.py index 6a3d58447e..5f3489cdbc 100644 --- a/platform/pensando/sonic-platform-modules-dpu/dpu/utils/dpu_db_util.py +++ b/platform/pensando/sonic-platform-modules-dpu/dpu/utils/dpu_db_util.py @@ -6,18 +6,15 @@ import sys import signal -import subprocess import threading import time from datetime import datetime -import json import docker import redis import syslog from sonic_py_common import daemon_base, logger, syslogger -import multiprocessing import grpc -from concurrent import futures +import queue SYSLOG_IDENTIFIER = 'dpu-db-utild' logger_instance = syslogger.SysLogger(SYSLOG_IDENTIFIER) @@ -29,7 +26,6 @@ def log_err(msg, also_print_to_console=False): logger_instance.log_error(msg, also_print_to_console) try: - from health_checker.manager import HealthCheckerManager from sonic_py_common import daemon_base import sonic_platform from sonic_platform.chassis import Chassis @@ -84,11 +80,12 @@ def __init__(self, chassis, db): super(EventHandler, self).__init__(SYSLOG_IDENTIFIER) # operd attributes - self.events = [] - self.events.extend(CRITICAL_EVENTS) - self.events.extend(NETWORK_EVENTS) - self.event_thread = None - self.event_stop = False + self.event_types = [] + self.event_types.extend(CRITICAL_EVENTS) + self.event_types.extend(NETWORK_EVENTS) + 
self.request_queue = queue.Queue() + self.thread = None + self.stop_event = threading.Event() # dpu state db update related attributes try: @@ -242,25 +239,13 @@ def _update_dpu_control_plane_db(self): except Exception as e: log_err(f'Failed to populate dpu control plane entries due to {e}') - def _getGrpcEventMessage(self): - while True: - if self.event_stop: - return - grpcmsg = oper_pb2.OperInfoRequest() - spec = grpcmsg.Request.add() - spec.InfoType = oper_pb2.OPER_INFO_TYPE_EVENT - spec.Action = oper_pb2.OPER_INFO_OP_SUBSCRIBE - for event in self.events: - spec.EventFilter.Types.append(event) - yield grpcmsg - def _process_event(self, event): global g_count try: - event_type = event.EventInfo.Type - event_description = event.EventInfo.Description - event_message = event.EventInfo.Message + event_type = event.Type + event_description = event.Description + event_message = event.Message except Exception as e: log_err(f"Failed to process event due to {e}") return @@ -301,29 +286,60 @@ def _process_event(self, event): except Exception as e: log_err(f"Failed to update dpu state db control plane fields due to {e}") + def _build_event_request(self): + """Builds a single gRPC subscription request.""" + request = oper_pb2.OperInfoRequest() + spec = request.Request.add() + spec.InfoType = oper_pb2.OPER_INFO_TYPE_EVENT + spec.Action = oper_pb2.OPER_INFO_OP_SUBSCRIBE + for t in self.event_types: + spec.EventFilter.Types.append(t) + return request + + def _request_generator(self): + """Generator that yields the gRPC request once.""" + yield self._build_event_request() + while not self.stop_event.is_set(): + time.sleep(1) + def _event_listener(self): - channel_addr = "{}:{}".format(LOCALHOST,str(EVENT_PORT)) + channel_addr = f"{LOCALHOST}:{EVENT_PORT}" + log_info(f"Connecting to gRPC server at {channel_addr}") channel = grpc.insecure_channel(channel_addr) stub = oper_pb2_grpc.OperSvcStub(channel) - resp = stub.OperInfoSubscribe(self._getGrpcEventMessage()) - time.sleep(1) - 
for event in resp: - self._process_event(event) + + try: + response_stream = stub.OperInfoSubscribe(self._request_generator()) + for response in response_stream: + if self.stop_event.is_set(): + break + if response.Status != 0: + log_info(f"Received non-OK status: {response.Status}") + continue + event = response.EventInfo + try: + self._process_event(event) + except Exception as e: + log_err(f"_process_event() raised exception: {e}") + except grpc.RpcError as e: + log_err(f"gRPC error: {e}") + finally: + log_info("Event listener thread exiting") def start(self): - # spawn operd listener thread - self.event_stop = False - if (self.event_thread == None) or (not self.event_thread.is_alive()): - self.event_thread = threading.Thread(target=self._event_listener) - self.event_thread.daemon = True - self.event_thread.start() + if self.thread is None or not self.thread.is_alive(): + self.stop_event.clear() + self.thread = threading.Thread(target=self._event_listener, daemon=True) + self.thread.start() + log_info("Event listener thread started") def stop(self): - self.log_warning("Stopping event listener thread") - self.event_stop = True - if self.event_thread is not None: - self.event_thread.join() - self.event_thread = None + log_info("Stopping event listener thread") + self.stop_event.set() + if self.thread is not None: + self.thread.join(timeout=2) + log_info("Event listener thread stopped") + self.thread = None # # Daemon ======================================================================= diff --git a/platform/pensando/sonic-platform-modules-dpu/dpu/utils/fetch_dpu_status b/platform/pensando/sonic-platform-modules-dpu/dpu/utils/fetch_dpu_status index 0dc99800b9..c139e52320 100755 --- a/platform/pensando/sonic-platform-modules-dpu/dpu/utils/fetch_dpu_status +++ b/platform/pensando/sonic-platform-modules-dpu/dpu/utils/fetch_dpu_status @@ -7,11 +7,7 @@ # ############################################################################# -import sys -import subprocess -from 
datetime import datetime -import json -import syslog +from datetime import datetime, timezone, timedelta import docker import redis from sonic_py_common import syslogger @@ -26,9 +22,7 @@ def log_err(msg, also_print_to_console=False): logger_instance.log_error(msg, also_print_to_console) try: - from swsscommon import swsscommon from health_checker.manager import HealthCheckerManager - from sonic_py_common import daemon_base from sonic_platform.chassis import Chassis from sonic_platform.helper import APIHelper except Exception as e: @@ -42,10 +36,13 @@ STATE_DB = 6 REDIS_LOCALHOST_SERVER_PORT = 6379 REDIS_LOCALHOST_SERVER_IP = '127.0.0.1' NOT_AVAILABLE = 'N/A' - +STARTUP_CONTAINER_LIST = ['swss', 'syncd', 'database'] +CONTAINERS_TO_IGNORE = ['macsec'] try: apiHelper = APIHelper() chassis = Chassis() + dpu_docker_name = apiHelper.get_dpu_docker_container_name() + STARTUP_CONTAINER_LIST.append(dpu_docker_name) except Exception as e: log_err(f'failed to fetch dpu docker name due to {e}') @@ -67,6 +64,14 @@ def bool_to_link_status(status): else: return "down" +def parse_docker_time(ts: str) -> datetime: + ts = ts.rstrip("Z") + if '.' 
in ts: + base, fraction = ts.split('.') + ts = f"{base}.{fraction[:6]}" + dt = datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S.%f" if '.' in ts else "%Y-%m-%dT%H:%M:%S") + return dt.replace(tzinfo=timezone.utc) + class DPUHealthUpdater(): def __init__(self, chassis): @@ -117,18 +122,31 @@ class DPUHealthUpdater(): client = docker.from_env() containers = client.containers.list(all=True) all_container_status = True + startup_container_status = True + swss_container_uptime = None container_not_running = [] container_restarting = [] reason = "" for container in containers: container_name = container.name container_status = container.status + if container_name in CONTAINERS_TO_IGNORE: + continue if container_status == 'restarting': + if container_name in STARTUP_CONTAINER_LIST: + startup_container_status &= False all_container_status &= False container_restarting.append(container_name) elif container_status == 'exited': + if container_name in STARTUP_CONTAINER_LIST: + startup_container_status &= False all_container_status &= False container_not_running.append(container_name) + if container_name == "swss" and startup_container_status: + swss_container_uptime = datetime.now(timezone.utc) - parse_docker_time(container.attrs['State']['StartedAt']) + if startup_container_status and swss_container_uptime is not None and swss_container_uptime < timedelta(minutes=2): + reason = "Early stage containers (swss, syncd, database) are up and running" + return startup_container_status, reason if container_not_running: reason += "Container not running : " + ', '.join(container_not_running) if container_restarting: diff --git a/platform/pensando/sonic-platform-modules-dpu/sonic_platform/chassis.py b/platform/pensando/sonic-platform-modules-dpu/sonic_platform/chassis.py index c6f092baf8..0885ec88e7 100644 --- a/platform/pensando/sonic-platform-modules-dpu/sonic_platform/chassis.py +++ b/platform/pensando/sonic-platform-modules-dpu/sonic_platform/chassis.py @@ -23,7 +23,7 @@ NUM_THERMAL = 2 NUM_VOLTAGE_SENSORS = 6 -NUM_CURRENT_SENSORS = 3 +NUM_CURRENT_SENSORS = 6 
HOST_REBOOT_CAUSE_PATH = "/host/reboot-cause/" REBOOT_CAUSE_FILE = "reboot-cause.txt" HOST_CHK_CMD = "docker > /dev/null 2>&1" @@ -140,7 +140,7 @@ def __initialize_thermals(self): global NUM_THERMAL board_id = self._api_helper.get_board_id() if board_id == self._api_helper.mtfuji_board_id: - NUM_THERMAL = 5 + NUM_THERMAL = 8 if Thermal._thermals_available(): for index in range(0, NUM_THERMAL): thermal = Thermal(index) diff --git a/platform/pensando/sonic-platform-modules-dpu/sonic_platform/sensor.py b/platform/pensando/sonic-platform-modules-dpu/sonic_platform/sensor.py index 19b165da1a..c83bf4ca5f 100644 --- a/platform/pensando/sonic-platform-modules-dpu/sonic_platform/sensor.py +++ b/platform/pensando/sonic-platform-modules-dpu/sonic_platform/sensor.py @@ -29,9 +29,12 @@ # [ Sensor-Name, sysfs, low_threshold, high_threshold, critical_low, critical_high] CURRENT_SENSOR_MAPPING = [ - ["Current sensor 1", "/sys/class/hwmon/hwmon0/curr1_input", "0", "15100", NOT_AVAILABLE, "30000"], - ["Current sensor 2", "/sys/class/hwmon/hwmon1/curr1_input", "0", "13800", NOT_AVAILABLE, "30000"], - ["Current sensor 3", "/sys/class/hwmon/hwmon2/curr1_input", "0", "79100", NOT_AVAILABLE, "30000"] + ["VP0P85_VDD_DDR_DPU0", "/sys/bus/i2c/devices/0-0044/hwmon/hwmon2/curr1_input", "0", "15100", NOT_AVAILABLE, "30000"], + ["VP1P2_DDR_VDDQ_DPU0", "/sys/bus/i2c/devices/0-0044/hwmon/hwmon2/curr2_input", "0", "13800", NOT_AVAILABLE, "30000"], + ["VP0P75_VDD_CORE_DPU0 1", "/sys/bus/i2c/devices/0-0055/hwmon/hwmon1/curr1_input", "0", "25000", NOT_AVAILABLE, "30000"], + ["VP0P75_VDD_CORE_DPU0 2", "/sys/bus/i2c/devices/0-0055/hwmon/hwmon1/curr2_input", "0", "25000", NOT_AVAILABLE, "30000"], + ["VP0P75_VDD_CORE_DPU0 3", "/sys/bus/i2c/devices/0-0066/hwmon/hwmon0/curr1_input", "0", "25000", NOT_AVAILABLE, "30000"], + ["VP0P85_VDD_ARM_DPU0", "/sys/bus/i2c/devices/0-0066/hwmon/hwmon0/curr2_input", "0", "29100", NOT_AVAILABLE, "30000"], ] class VoltageSensor(SensorBase): @@ -207,3 +210,4 @@ def 
get_low_critical_threshold(self): if value == NOT_AVAILABLE: return NOT_AVAILABLE return float(value) + diff --git a/platform/pensando/sonic-platform-modules-dpu/sonic_platform/thermal.py b/platform/pensando/sonic-platform-modules-dpu/sonic_platform/thermal.py index f4d262be18..10dd541fc1 100644 --- a/platform/pensando/sonic-platform-modules-dpu/sonic_platform/thermal.py +++ b/platform/pensando/sonic-platform-modules-dpu/sonic_platform/thermal.py @@ -31,9 +31,12 @@ class Thermal(ThermalBase): SENSOR_MAPPING_MTFUJI = [ ["Die temperature", "/sys/class/hwmon/hwmon0/temp2_input", 1, 110, -10, 130], ["Board temperature", "/sys/class/hwmon/hwmon0/temp1_input", 1, 110, -10, 130], - ["Thermal sensor 1", "/sys/class/hwmon/hwmon0/temp1_input", 1, 110, -10, 130], - ["Thermal sensor 2", "/sys/class/hwmon/hwmon1/temp1_input", 1, 110, -10, 130], - ["Thermal sensor 3", "/sys/class/hwmon/hwmon2/temp1_input", 1, 110, -10, 130] + ["VP0P85_VDD_DDR_DPU0", "/sys/bus/i2c/devices/0-0044/hwmon/hwmon2/temp2_input", 1, 110, -10, 130], + ["VP1P2_DDR_VDDQ_DPU0", "/sys/bus/i2c/devices/0-0044/hwmon/hwmon2/temp3_input", 1, 110, -10, 130], + ["VP0P75_VDD_CORE_DPU0 1", "/sys/bus/i2c/devices/0-0055/hwmon/hwmon1/temp2_input", 1, 110, -10, 130], + ["VP0P75_VDD_CORE_DPU0 2", "/sys/bus/i2c/devices/0-0055/hwmon/hwmon1/temp3_input", 1, 110, -10, 130], + ["VP0P75_VDD_CORE_DPU0 3", "/sys/bus/i2c/devices/0-0066/hwmon/hwmon0/temp2_input", 1, 110, -10, 130], + ["VP0P85_VDD_ARM_DPU0", "/sys/bus/i2c/devices/0-0066/hwmon/hwmon0/temp3_input", 1, 110, -10, 130], ] @classmethod @@ -43,7 +46,7 @@ def _thermals_available(cls): apiHelper = APIHelper() g_board_id = apiHelper.get_board_id() temp_hwmon = '/sys/bus/i2c/devices/i2c-0/0-004c/hwmon' - if g_board_id == self._api_helper.mtfuji_board_id: + if g_board_id == apiHelper.mtfuji_board_id: temp_hwmon = '/sys/class/hwmon/hwmon0/temp1_input' if os.path.exists(temp_hwmon): return True