Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions scripts/coredump_gen_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,35 @@
For more info, refer to the Event Driven TechSupport & CoreDump Mgmt HLD
"""
import os
import time
import argparse
import syslog
from swsscommon.swsscommon import SonicV2Connector
from utilities_common.auto_techsupport_helper import *


def wait_saisdkdump(db, container, name):
    """
    Wait for the saisdkdump lock file to show up, then remove it.

    Per the original author's note, the host creates the saisdkdump before
    syncd is stopped when the orchagent aborts due to a SAI programming
    failure. This function polls for SDKDUMP_LOCK every SDKDUMP_SLEEP
    seconds for at most SDKDUMP_TIMEOUT seconds. If the lock file is found
    it is deleted so that a later invocation starts from a clean state.

    Args:
        db: connector exposing ``exists(db_name, key)``; used to check
            whether ORCH_ABRT_TABLE is present in STATE_DB.
        container: name of the container the core dump came from.
        name: file name of the core dump.

    Returns:
        None. Returns silently both when the wait is skipped and when the
        timeout expires without the lock file appearing.
    """
    orch_abrted = db.exists(STATE_DB, ORCH_ABRT_TABLE)
    # NOTE(review): the wait is skipped only when *none* of the three
    # indicators is present, i.e. any single one of them triggers the wait.
    # Confirm whether `or` was intended here; behavior is left unchanged.
    if "swss" not in container and "orchagent" not in name and not orch_abrted:
        return

    # A monotonic clock keeps the timeout accurate even if the wall clock is
    # stepped (e.g. by NTP) while we are polling.
    deadline = time.monotonic() + SDKDUMP_TIMEOUT
    while not os.path.isfile(SDKDUMP_LOCK) and time.monotonic() < deadline:
        time.sleep(SDKDUMP_SLEEP)

    if os.path.isfile(SDKDUMP_LOCK):
        # Remove the lock once found
        syslog.syslog(syslog.LOG_INFO, "Waited until the saisdkdump is collected, proceeding forward..")
        try:
            os.remove(SDKDUMP_LOCK)
        except FileNotFoundError:
            # The lock disappeared between the check and the removal;
            # the end state is the one we wanted, so there is nothing to do.
            pass


def handle_coredump_cleanup(dump_name, db):
_, num_bytes = get_stats(os.path.join(CORE_DUMP_DIR, CORE_DUMP_PTRN))

Expand Down Expand Up @@ -74,6 +97,7 @@ def main():
syslog.syslog(syslog.LOG_INFO, "Spurious Invocation. {} is not created within last {} sec".format(file_path, TIME_BUF))
return
cls = CriticalProcCoreDumpHandle(args.name, args.container, db)
wait_saisdkdump(db, args.container, args.name)
cls.handle_core_dump_creation_event()
handle_coredump_cleanup(args.name, db)

Expand Down
63 changes: 49 additions & 14 deletions scripts/generate_dump
Original file line number Diff line number Diff line change
Expand Up @@ -1052,21 +1052,26 @@ collect_mellanox() {
local sai_dump_folder="/tmp/saisdkdump"
local sai_dump_filename="${sai_dump_folder}/sai_sdk_dump_$(date +"%m_%d_%Y_%I_%M_%p")"

${CMD_PREFIX}docker exec syncd mkdir -p $sai_dump_folder
${CMD_PREFIX}docker exec syncd saisdkdump -f $sai_dump_filename

if [ $? != 0 ]; then
echo "Failed to collect saisdkdump."
fi
if [[ "$( docker container inspect -f '{{.State.Running}}' syncd )" == "true" ]]; then
if [[ x"$(sonic-db-cli APPL_DB EXISTS PORT_TABLE:PortInitDone)" == x"1" ]]; then
# Run saisdkdump only after the create_switch is known to be successful
${CMD_PREFIX}docker exec syncd mkdir -p $sai_dump_folder
${CMD_PREFIX}${timeout_cmd} docker exec syncd saisdkdump -f $sai_dump_filename

if [ $? != 0 ]; then
echo "Failed to collect saisdkdump."
fi

copy_from_docker syncd $sai_dump_folder $sai_dump_folder
echo "$sai_dump_folder"
for file in `ls $sai_dump_folder`; do
save_file ${sai_dump_folder}/${file} sai_sdk_dump true
done
copy_from_docker syncd $sai_dump_folder $sai_dump_folder
echo "$sai_dump_folder"
for file in `ls $sai_dump_folder`; do
save_file ${sai_dump_folder}/${file} sai_sdk_dump true
done

${CMD_PREFIX}rm -rf $sai_dump_folder
${CMD_PREFIX}docker exec syncd rm -rf $sai_dump_folder
${CMD_PREFIX}rm -rf $sai_dump_folder
${CMD_PREFIX}docker exec syncd rm -rf $sai_dump_folder
fi
fi

# run 'hw-management-generate-dump.sh' script and save the result file
HW_DUMP_FILE=/usr/bin/hw-management-generate-dump.sh
Expand All @@ -1086,7 +1091,6 @@ collect_mellanox() {
else
echo "HW Mgmt dump script $HW_DUMP_FILE does not exist"
fi

}

###############################################################################
Expand Down Expand Up @@ -1428,6 +1432,36 @@ save_crash_files() {
fi
}

###############################################################################
# Collect saisdkdump files under /var/log/orch_abrt_sdkdump. These files are
# created because of the orchagent abort triggered by SAI programming failure.
# A file whose name is already listed under log/ in the tarball is added as a
# symbolic link instead of being stored a second time.
# Globals:
#  TAR, TARFILE, BASE, CMD_PREFIX
# Arguments:
#  None
# Returns:
#  None
###############################################################################
save_saisdkdump_files() {
    local file
    local name
    for file in $(find_files "/var/log/orch_abrt_sdkdump/"); do
        name=$(basename "$file")
        # -F matches the name literally, so dots in it are not regex wildcards.
        # Output goes to /dev/null to keep the archive listing off stdout;
        # a redirect is used instead of -q so grep drains the whole listing
        # and tar never receives SIGPIPE.
        if $TAR -tf "$TARFILE" | grep -F -- "$BASE/log/$name" > /dev/null; then
            # if the files are already collected under the log/ dir
            # just add a symbolic link
            if [[ "$file" != *.gz ]]; then
                # files saved under log/ are zipped with gz
                file="$file.gz"
            fi
            ${CMD_PREFIX}save_symlink "${file}" orch_abrt_sdkdump log
        else
            # Third argument is true only for files that are not already .gz
            # (presumably the "compress" switch of save_file; verify there).
            if [[ "$file" != *.gz ]]; then
                ${CMD_PREFIX}save_file "${file}" orch_abrt_sdkdump true
            else
                ${CMD_PREFIX}save_file "${file}" orch_abrt_sdkdump false
            fi
        fi
    done
}

###############################################################################
# Get number of ASICs in the platform
# Globals:
Expand Down Expand Up @@ -1705,6 +1739,7 @@ main() {
save_log_files
save_crash_files
save_warmboot_files
save_saisdkdump_files

if [[ "$asic" = "mellanox" ]]; then
collect_mellanox_dfw_dumps
Expand Down
26 changes: 26 additions & 0 deletions tests/coredump_gen_handler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ class TestCoreDumpCreationEvent(unittest.TestCase):
def setUp(self):
cdump_mod.TIME_BUF = 1
cdump_mod.WAIT_BUFFER = 1
cdump_mod.SDKDUMP_TIMEOUT = 1.6
cdump_mod.SDKDUMP_SLEEP = 0.2

def test_invoc_ts_state_db_update(self):
"""
Expand Down Expand Up @@ -469,3 +471,27 @@ def mock_cmd(cmd, env):
patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz")
cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock)
cls.handle_core_dump_creation_event()

def test_wait_for_sdkdump(self):
    """
    Scenario: Check if the auto-techsupport waits until the saisdkdump is created.

    Phase 1: the lock file already exists (on the fake filesystem), so
    wait_saisdkdump must return before the 5 second alarm fires and must
    have removed the lock file.
    Phase 2: the lock file is absent, so wait_saisdkdump must block for
    roughly SDKDUMP_TIMEOUT before giving up.
    """
    db_wrap = Db()
    redis_mock = db_wrap.db
    # Set Orch Abort Status
    redis_mock.get_redis_client(cdump_mod.STATE_DB).set(cdump_mod.ORCH_ABRT_TABLE, "1")
    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(5)  # 5 seconds
    lock_left_behind = None
    try:
        with Patcher() as patcher:
            patcher.fs.create_file(cdump_mod.SDKDUMP_LOCK)
            cdump_mod.wait_saisdkdump(redis_mock, "swss0", "orchagent.xxxx.yy.core.gz")
            # Sample the state while the fake filesystem is still active, but
            # assert on it outside the try block so that a failure is not
            # caught below and misreported as a timeout.
            lock_left_behind = os.path.isfile(cdump_mod.SDKDUMP_LOCK)
    except Exception:
        assert False, "wait_saisdkdump should not time out"
    finally:
        signal.alarm(0)
    assert not lock_left_behind, "wait_saisdkdump should remove the lock file once found"

    # NOTE(review): this phase runs outside the Patcher, i.e. against the
    # real filesystem; it assumes SDKDUMP_LOCK does not exist on the host.
    curr = time.time()
    cdump_mod.wait_saisdkdump(redis_mock, "swss0", "orchagent.xxxx.yy.core.gz")
    assert time.time() - curr >= cdump_mod.SDKDUMP_TIMEOUT - 0.1
8 changes: 7 additions & 1 deletion utilities_common/auto_techsupport_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
"TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER", "TIME_BUF",
"SINCE_DEFAULT", "TS_PTRN_GLOB", "EXT_LOCKFAIL", "EXT_RETRY",
"EXT_SUCCESS", "MAX_RETRY_LIMIT", "EVENT_TYPE", "EVENT_TYPE_CORE",
"EVENT_TYPE_MEMORY"
"EVENT_TYPE_MEMORY", "SDKDUMP_LOCK", "SDKDUMP_TIMEOUT", "SDKDUMP_SLEEP",
"ORCH_ABRT_TABLE"
] + [ # Methods
"verify_recent_file_creation",
"get_ts_dumps",
Expand Down Expand Up @@ -70,6 +71,11 @@
SINCE_DEFAULT = "2 days ago"
TS_GLOBAL_TIMEOUT = "60"

# Flag file that wait_saisdkdump (scripts/coredump_gen_handler.py) polls for
# and removes once it is found
SDKDUMP_LOCK = "/tmp/saidump_collection_notify_flag"
# Upper bound, in seconds, on how long wait_saisdkdump polls for SDKDUMP_LOCK
SDKDUMP_TIMEOUT = 120
# Pause, in seconds, between two consecutive polls (passed to time.sleep)
SDKDUMP_SLEEP = 5
# STATE_DB key whose existence wait_saisdkdump checks via db.exists
ORCH_ABRT_TABLE = "ORCH_ABRT_STATUS"

# Explicitly pass this to the subprocess invoking techsupport
ENV_VAR = os.environ
if ('CROSS_BUILD_ENVIRON' not in ENV_VAR) or (ENV_VAR['CROSS_BUILD_ENVIRON'] != 'y'):
Expand Down