Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions device/pensando/arm64-elba-asic-flash128-r0/pre_reboot_hook
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ LOG_MSG "Dropping caches..."
LOG_MSG 3 > /proc/sys/vm/drop_caches
LOG_MSG "Cache cleared."

sleep 2
LOG_MSG "Pre-reboot flush and cache clear complete."

sleep 2
LOG_MSG "Writing pre-shutdown success to CPLD reg"
docker exec "$(cat /host/dpu-docker-info/name)" /nic/bin/cpldapp -w 0xd 200

LOG_MSG "Platform specific pre-shutdown is successful"

Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"services_to_ignore": ["vlanmgrd", "vxlanmgrd"],
"devices_to_ignore": ["psu","fan"],
"user_defined_checkers": ["fetch_dpu_status"],
"polling_interval": 60,
"polling_interval": 20,
"led_color": {
"fault": "amber",
"normal": "green",
Expand Down
8 changes: 6 additions & 2 deletions files/dsc/dpu.init
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ function start_polaris()
mkdir -p $HOST_DIR_POLARIS/obfl
mkdir -p $HOST_DIR_POLARIS/data
mkdir -p $HOST_DIR_POLARIS/share
mount -t tmpfs -o size=20M sharetmps $HOST_DIR_POLARIS/share
mount -t tmpfs -o size=20M tmpfs $HOST_DIR_POLARIS/share
mount -t tmpfs -o size=5M tmpfs $HOST_DIR_POLARIS/external
mkdir -p $HOST_DIR_POLARIS/dsc
mount -t tmpfs -o size=5M dsctmps $HOST_DIR_POLARIS/dsc
mkdir -p $HOST_DIR_POLARIS/mnt
Expand All @@ -61,7 +62,7 @@ function start_polaris()

docker ps -a --format "{{.ID}}\t{{.Image}}" | grep "$IMAGE_NAME:$TAG" | awk '{print $1}' | xargs -I {} docker rm {}

docker run -v $HOST_DIR_POLARIS/update:/update -v $HOST_DIR_POLARIS/sysconfig/config0:/sysconfig/config0 -v $HOST_DIR_POLARIS/sysconfig/config1:/sysconfig/config1 -v $HOST_DIR_POLARIS/obfl/a:/obfl -v $HOST_DIR_POLARIS/obfl:/var/log/obfl -v $HOST_DIR_POLARIS/data:/data -v $HOST_DIR_POLARIS/share:/share -v $HOST_DIR_POLARIS/share:/external -v $HOST_DIR_POLARIS/mnt/a:/ro -v /dev:/dev -v /sys:/sys --net=host --name=$CONTAINER_NAME_POLARIS --privileged $IMAGE_NAME:$TAG &
docker run -v $HOST_DIR_POLARIS/update:/update -v $HOST_DIR_POLARIS/sysconfig/config0:/sysconfig/config0 -v $HOST_DIR_POLARIS/sysconfig/config1:/sysconfig/config1 -v $HOST_DIR_POLARIS/obfl/a:/obfl -v $HOST_DIR_POLARIS/obfl:/var/log/obfl -v $HOST_DIR_POLARIS/data:/data -v $HOST_DIR_POLARIS/share:/share -v $HOST_DIR_POLARIS/external:/external -v $HOST_DIR_POLARIS/mnt/a:/ro -v /dev:/dev -v /sys:/sys --net=host --name=$CONTAINER_NAME_POLARIS --privileged $IMAGE_NAME:$TAG &

if [ -f /boot/first_boot ]; then
device="/usr/share/sonic/device"
Expand All @@ -74,6 +75,9 @@ function start_polaris()
sleep 5
INTERFACE="eth0-midplane"
if ip link show "$INTERFACE" &> /dev/null; then
echo "dhclient -r $INTERFACE"
/usr/sbin/dhclient -r $INTERFACE
sleep 1
echo "dhclient $INTERFACE"
/usr/sbin/dhclient $INTERFACE
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ Name=eth0-midplane

[Network]
DHCP=yes

[DHCPv4]
ClientIdentifier=mac
2 changes: 1 addition & 1 deletion platform/checkout/pensando.ini
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[module]
repo=git@github.com:pensando/pensando-sonic-artifacts.git
ref=1.87.0-SS-15-release
ref=1.95.0-SS-17-release
path=platform/pensando/pensando-sonic-artifacts
15 changes: 13 additions & 2 deletions platform/pensando/one-image.mk
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,21 @@ $(SONIC_ONE_IMAGE)_INSTALLS += $(DPU_MODULE)
$(SONIC_ONE_IMAGE)_INSTALLS += $(IONIC_MODULE)
$(SONIC_ONE_IMAGE)_INSTALLS += $(PENSANDO_DPU_PLATFORM_MODULE)


# Docker images, local packages and feature flags that are turned off for
# this platform's one-image build.
DISABLED_DOCKERS = $(DOCKER_SFLOW) $(DOCKER_MGMT_FRAMEWORK) $(DOCKER_NAT) $(DOCKER_TEAMD) $(DOCKER_ROUTER_ADVERTISER) $(DOCKER_MUX) $(DOCKER_SNMP)
DISABLED_PACKAGES_LOCAL = $(DOCKER_DHCP_RELAY) $(DOCKER_MACSEC)
DISABLED_FEATURE_FLAGS = INCLUDE_SFLOW INCLUDE_MGMT_FRAMEWORK INCLUDE_NAT INCLUDE_MACSEC INCLUDE_TEAMD INCLUDE_ROUTER_ADVERTISER INCLUDE_MUX
$(info Disabling the following docker images: $(DISABLED_DOCKERS))
$(info Disabling the following packages: $(DISABLED_PACKAGES_LOCAL))
$(info Disabling the following feature flags: $(DISABLED_FEATURE_FLAGS))

# Remove the disabled packages from the list of locally built packages.
SONIC_PACKAGES_LOCAL := $(filter-out $(DISABLED_PACKAGES_LOCAL), $(SONIC_PACKAGES_LOCAL))
# Force every disabled feature flag to "n". There is deliberately no space
# before the closing parenthesis: make keeps trailing whitespace in a variable
# value, which would make the flag "n " instead of "n".
$(foreach feature, $(DISABLED_FEATURE_FLAGS), $(eval override $(feature)=n))

ifeq ($(INSTALL_DEBUG_TOOLS),y)
$(SONIC_ONE_IMAGE)_DOCKERS += $(SONIC_INSTALL_DOCKER_DBG_IMAGES)
$(SONIC_ONE_IMAGE)_DOCKERS += $(filter-out $(patsubst %-$(DBG_IMAGE_MARK).gz,%.gz, $(SONIC_INSTALL_DOCKER_DBG_IMAGES)), $(SONIC_INSTALL_DOCKER_IMAGES))
$(SONIC_ONE_IMAGE)_DOCKERS += $(filter-out $(patsubst %-$(DBG_IMAGE_MARK).gz,%.gz, $(SONIC_INSTALL_DOCKER_DBG_IMAGES)), $(filter-out $(DISABLED_DOCKERS), $(SONIC_INSTALL_DOCKER_IMAGES)))
else
$(SONIC_ONE_IMAGE)_DOCKERS = $(SONIC_INSTALL_DOCKER_IMAGES)
$(SONIC_ONE_IMAGE)_DOCKERS = $(filter-out $(DISABLED_DOCKERS), $(SONIC_INSTALL_DOCKER_IMAGES))
endif
SONIC_INSTALLERS += $(SONIC_ONE_IMAGE)
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ After=dpu-platform-init.service
[Service]
ExecStart=/usr/bin/python3 /usr/local/bin/dpu_db_util.py
Restart=always
StandardOutput=syslog+console
StandardError=syslog+console
StandardOutput=journal+console
StandardError=journal+console

[Install]
WantedBy=multi-user.target
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,15 @@

import sys
import signal
import subprocess
import threading
import time
from datetime import datetime
import json
import docker
import redis
import syslog
from sonic_py_common import daemon_base, logger, syslogger
import multiprocessing
import grpc
from concurrent import futures
import queue

SYSLOG_IDENTIFIER = 'dpu-db-utild'
logger_instance = syslogger.SysLogger(SYSLOG_IDENTIFIER)
Expand All @@ -29,7 +26,6 @@ def log_err(msg, also_print_to_console=False):
logger_instance.log_error(msg, also_print_to_console)

try:
from health_checker.manager import HealthCheckerManager
from sonic_py_common import daemon_base
import sonic_platform
from sonic_platform.chassis import Chassis
Expand Down Expand Up @@ -84,11 +80,12 @@ def __init__(self, chassis, db):
super(EventHandler, self).__init__(SYSLOG_IDENTIFIER)

# operd attributes
self.events = []
self.events.extend(CRITICAL_EVENTS)
self.events.extend(NETWORK_EVENTS)
self.event_thread = None
self.event_stop = False
self.event_types = []
self.event_types.extend(CRITICAL_EVENTS)
self.event_types.extend(NETWORK_EVENTS)
self.request_queue = queue.Queue()
self.thread = None
self.stop_event = threading.Event()

# dpu state db update related attributes
try:
Expand Down Expand Up @@ -242,25 +239,13 @@ def _update_dpu_control_plane_db(self):
except Exception as e:
log_err(f'Failed to populate dpu control plane entries due to {e}')

def _getGrpcEventMessage(self):
while True:
if self.event_stop:
return
grpcmsg = oper_pb2.OperInfoRequest()
spec = grpcmsg.Request.add()
spec.InfoType = oper_pb2.OPER_INFO_TYPE_EVENT
spec.Action = oper_pb2.OPER_INFO_OP_SUBSCRIBE
for event in self.events:
spec.EventFilter.Types.append(event)
yield grpcmsg

def _process_event(self, event):
global g_count

try:
event_type = event.EventInfo.Type
event_description = event.EventInfo.Description
event_message = event.EventInfo.Message
event_type = event.Type
event_description = event.Description
event_message = event.Message
except Exception as e:
log_err(f"Failed to process event due to {e}")
return
Expand Down Expand Up @@ -301,29 +286,60 @@ def _process_event(self, event):
except Exception as e:
log_err(f"Failed to update dpu state db control plane fields due to {e}")

def _build_event_request(self):
    """Return one OperInfoRequest that subscribes to the configured events.

    The request carries a single spec with the EVENT info type and the
    SUBSCRIBE action, filtered to every entry of ``self.event_types``.
    """
    subscribe_request = oper_pb2.OperInfoRequest()
    entry = subscribe_request.Request.add()
    entry.Action = oper_pb2.OPER_INFO_OP_SUBSCRIBE
    entry.InfoType = oper_pb2.OPER_INFO_TYPE_EVENT
    # Copy the wanted event types into the filter one by one.
    for event_type in self.event_types:
        entry.EventFilter.Types.append(event_type)
    return subscribe_request

def _request_generator(self):
    """Yield the subscription request once, then block until stop is requested.

    Exactly one request (built by ``_build_event_request``) is produced.
    After that the generator does not finish until ``self.stop_event`` is
    set.
    NOTE(review): presumably this keeps the request side of the gRPC stream
    open while events are being received - confirm against the server.
    """
    yield self._build_event_request()
    # Wait on the event instead of polling with sleep(1), so the generator
    # ends as soon as stop() sets the event rather than up to a second later.
    self.stop_event.wait()

def _event_listener(self):
channel_addr = "{}:{}".format(LOCALHOST,str(EVENT_PORT))
channel_addr = f"{LOCALHOST}:{EVENT_PORT}"
log_info(f"Connecting to gRPC server at {channel_addr}")
channel = grpc.insecure_channel(channel_addr)
stub = oper_pb2_grpc.OperSvcStub(channel)
resp = stub.OperInfoSubscribe(self._getGrpcEventMessage())
time.sleep(1)
for event in resp:
self._process_event(event)

try:
response_stream = stub.OperInfoSubscribe(self._request_generator())
for response in response_stream:
if self.stop_event.is_set():
break
if response.Status != 0:
log_info(f"Received non-OK status: {response.Status}")
continue
event = response.EventInfo
try:
self._process_event(event)
except Exception as e:
log_err(f"_process_event() raised exception: {e}")
except grpc.RpcError as e:
log_err(f"gRPC error: {e}")
finally:
log_info("Event listener thread exiting")

def start(self):
# spawn operd listener thread
self.event_stop = False
if (self.event_thread == None) or (not self.event_thread.is_alive()):
self.event_thread = threading.Thread(target=self._event_listener)
self.event_thread.daemon = True
self.event_thread.start()
if self.thread is None or not self.thread.is_alive():
self.stop_event.clear()
self.thread = threading.Thread(target=self._event_listener, daemon=True)
self.thread.start()
log_info("Event listener thread started")

def stop(self):
self.log_warning("Stopping event listener thread")
self.event_stop = True
if self.event_thread is not None:
self.event_thread.join()
self.event_thread = None
log_info("Stopping event listener thread")
self.stop_event.set()
if self.thread is not None:
self.thread.join(timeout=2)
log_info("Event listener thread stopped")
self.thread = None

#
# Daemon =======================================================================
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,7 @@
#
#############################################################################

import sys
import subprocess
from datetime import datetime
import json
import syslog
from datetime import datetime, timezone, timedelta
import docker
import redis
from sonic_py_common import syslogger
Expand All @@ -26,9 +22,7 @@ def log_err(msg, also_print_to_console=False):
logger_instance.log_error(msg, also_print_to_console)

try:
from swsscommon import swsscommon
from health_checker.manager import HealthCheckerManager
from sonic_py_common import daemon_base
from sonic_platform.chassis import Chassis
from sonic_platform.helper import APIHelper
except Exception as e:
Expand All @@ -42,10 +36,13 @@ STATE_DB = 6
REDIS_LOCALHOST_SERVER_PORT = 6379
REDIS_LOCALHOST_SERVER_IP = '127.0.0.1'
NOT_AVAILABLE = 'N/A'

STARTUP_CONTAINER_LIST = ['swss', 'syncd', 'database']
CONTAINERS_TO_IGNORE = ['macsec']
try:
apiHelper = APIHelper()
chassis = Chassis()
dpu_docker_name = apiHelper.get_dpu_docker_container_name()
STARTUP_CONTAINER_LIST.append(dpu_docker_name)
except Exception as e:
log_err(f'failed to fetch dpu docker name due to {e}')

Expand All @@ -67,6 +64,14 @@ def bool_to_link_status(status):
else:
return "down"

def parse_docker_time(ts: str) -> datetime:
    """Parse a Docker timestamp string into a timezone-aware UTC datetime.

    Accepts ``YYYY-MM-DDTHH:MM:SS`` with an optional fractional-seconds part
    of any length and an optional trailing ``Z``. Fractions longer than six
    digits (Docker reports nanoseconds) are truncated to microseconds, and
    timestamps without a fraction (for example ``0001-01-01T00:00:00Z``) are
    accepted as well.

    The result is always tagged as UTC; numeric offsets such as ``+02:00``
    are not handled.

    Args:
        ts: Timestamp string, e.g. the ``State.StartedAt`` container attribute.

    Returns:
        The parsed moment as a ``datetime`` with ``tzinfo=timezone.utc``.

    Raises:
        ValueError: If ``ts`` does not match the expected layout.
    """
    ts = ts.rstrip("Z")
    base, _, fraction = ts.partition(".")
    # %f accepts at most six digits; fall back to "0" when there is no
    # fractional part so a single format string covers both shapes.
    fraction = fraction[:6] or "0"
    dt = datetime.strptime(f"{base}.{fraction}", "%Y-%m-%dT%H:%M:%S.%f")
    return dt.replace(tzinfo=timezone.utc)

class DPUHealthUpdater():

def __init__(self, chassis):
Expand Down Expand Up @@ -117,18 +122,31 @@ class DPUHealthUpdater():
client = docker.from_env()
containers = client.containers.list(all=True)
all_container_status = True
startup_container_status = True
swss_container_uptime = 0
container_not_running = []
container_restarting = []
reason = ""
for container in containers:
container_name = container.name
container_status = container.status
if container_name in CONTAINERS_TO_IGNORE:
continue
if container_status == 'restarting':
if container_name in STARTUP_CONTAINER_LIST:
startup_container_status &= False
all_container_status &= False
container_restarting.append(container_name)
elif container_status == 'exited':
if container_name in STARTUP_CONTAINER_LIST:
startup_container_status &= False
all_container_status &= False
container_not_running.append(container_name)
if container_name == "swss" and startup_container_status:
swss_container_uptime = datetime.now(timezone.utc) - parse_docker_time(container.attrs['State']['StartedAt'])
if startup_container_status and swss_container_uptime < timedelta(minutes=2):
reason = "Early stage containers (swss, syncd, database) are up and running"
return startup_container_status, reason
if container_not_running:
reason += "Container not running : " + ', '.join(container_not_running)
if container_restarting:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

NUM_THERMAL = 2
NUM_VOLTAGE_SENSORS = 6
NUM_CURRENT_SENSORS = 3
NUM_CURRENT_SENSORS = 6
HOST_REBOOT_CAUSE_PATH = "/host/reboot-cause/"
REBOOT_CAUSE_FILE = "reboot-cause.txt"
HOST_CHK_CMD = "docker > /dev/null 2>&1"
Expand Down Expand Up @@ -140,7 +140,7 @@ def __initialize_thermals(self):
global NUM_THERMAL
board_id = self._api_helper.get_board_id()
if board_id == self._api_helper.mtfuji_board_id:
NUM_THERMAL = 5
NUM_THERMAL = 8
if Thermal._thermals_available():
for index in range(0, NUM_THERMAL):
thermal = Thermal(index)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,12 @@

# [ Sensor-Name, sysfs, low_threshold, high_threshold, critical_low, critical_high]
CURRENT_SENSOR_MAPPING = [
["Current sensor 1", "/sys/class/hwmon/hwmon0/curr1_input", "0", "15100", NOT_AVAILABLE, "30000"],
["Current sensor 2", "/sys/class/hwmon/hwmon1/curr1_input", "0", "13800", NOT_AVAILABLE, "30000"],
["Current sensor 3", "/sys/class/hwmon/hwmon2/curr1_input", "0", "79100", NOT_AVAILABLE, "30000"]
["VP0P85_VDD_DDR_DPU0", "/sys/bus/i2c/devices/0-0044/hwmon/hwmon2/curr1_input", "0", "15100", NOT_AVAILABLE, "30000"],
["VP1P2_DDR_VDDQ_DPU0", "/sys/bus/i2c/devices/0-0044/hwmon/hwmon2/curr2_input", "0", "13800", NOT_AVAILABLE, "30000"],
["VP0P75_VDD_CORE_DPU0 1", "/sys/bus/i2c/devices/0-0055/hwmon/hwmon1/curr1_input", "0", "25000", NOT_AVAILABLE, "30000"],
["VP0P75_VDD_CORE_DPU0 2", "/sys/bus/i2c/devices/0-0055/hwmon/hwmon1/curr2_input", "0", "25000", NOT_AVAILABLE, "30000"],
["VP0P75_VDD_CORE_DPU0 3", "/sys/bus/i2c/devices/0-0066/hwmon/hwmon0/curr1_input", "0", "25000", NOT_AVAILABLE, "30000"],
["VP0P85_VDD_ARM_DPU0", "/sys/bus/i2c/devices/0-0066/hwmon/hwmon0/curr2_input", "0", "29100", NOT_AVAILABLE, "30000"],
]

class VoltageSensor(SensorBase):
Expand Down Expand Up @@ -207,3 +210,4 @@ def get_low_critical_threshold(self):
if value == NOT_AVAILABLE:
return NOT_AVAILABLE
return float(value)

Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,12 @@ class Thermal(ThermalBase):
SENSOR_MAPPING_MTFUJI = [
["Die temperature", "/sys/class/hwmon/hwmon0/temp2_input", 1, 110, -10, 130],
["Board temperature", "/sys/class/hwmon/hwmon0/temp1_input", 1, 110, -10, 130],
["Thermal sensor 1", "/sys/class/hwmon/hwmon0/temp1_input", 1, 110, -10, 130],
["Thermal sensor 2", "/sys/class/hwmon/hwmon1/temp1_input", 1, 110, -10, 130],
["Thermal sensor 3", "/sys/class/hwmon/hwmon2/temp1_input", 1, 110, -10, 130]
["VP0P85_VDD_DDR_DPU0", "/sys/bus/i2c/devices/0-0044/hwmon/hwmon2/temp2_input", 1, 110, -10, 130],
["VP1P2_DDR_VDDQ_DPU0", "/sys/bus/i2c/devices/0-0044/hwmon/hwmon2/temp3_input", 1, 110, -10, 130],
["VP0P75_VDD_CORE_DPU0 1", "/sys/bus/i2c/devices/0-0055/hwmon/hwmon1/temp2_input", 1, 110, -10, 130],
["VP0P75_VDD_CORE_DPU0 2", "/sys/bus/i2c/devices/0-0055/hwmon/hwmon1/temp3_input", 1, 110, -10, 130],
["VP0P75_VDD_CORE_DPU0 3", "/sys/bus/i2c/devices/0-0066/hwmon/hwmon0/temp2_input", 1, 110, -10, 130],
["VP0P85_VDD_ARM_DPU0", "/sys/bus/i2c/devices/0-0066/hwmon/hwmon0/temp3_input", 1, 110, -10, 130],
]

@classmethod
Expand All @@ -43,7 +46,7 @@ def _thermals_available(cls):
apiHelper = APIHelper()
g_board_id = apiHelper.get_board_id()
temp_hwmon = '/sys/bus/i2c/devices/i2c-0/0-004c/hwmon'
if g_board_id == self._api_helper.mtfuji_board_id:
if g_board_id == apiHelper.mtfuji_board_id:
temp_hwmon = '/sys/class/hwmon/hwmon0/temp1_input'
if os.path.exists(temp_hwmon):
return True
Expand Down
Loading