Skip to content

Commit c007d65

Browse files
authored
[warm-reboot] Add new preboot health check: verify database integrity (sonic-net#1785)
What I did: Verify database integrity before proceeding with warm reboot or fast reboot. This integrity check uses a JSON schema to validate DBs. To start with, only the presence of counters_db's COUNTERS_PORT_NAME_MAP table is verified, but this list can be extended in the future. The test logic is designed to be generic: more databases, or more tables within them, can simply be added to the schema list, and the verification logic needs no change. How I did it: Added a JSON schema and generic schema validation logic.
1 parent 41e31e8 commit c007d65

2 files changed

Lines changed: 112 additions & 3 deletions

File tree

scripts/check_db_integrity.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
This is to verify if Database has critical tables present before warmboot can proceed.
5+
If warmboot is allowed to proceed with critical tables missing, it can cause failures on the
6+
shutdown path or during the recovery path. This test detects such issues before proceeding.
7+
The verification procedure here uses JSON schemas to verify the DB entities.
8+
9+
In future, to verify new tables or their content, just the schema modification is needed.
10+
No modification may be needed to the integrity check logic.
11+
"""
12+
13+
import os, sys
14+
import json, jsonschema
15+
import syslog
16+
import subprocess
17+
import traceback
18+
19+
# Maps a database name to the JSON schema its dump must satisfy.
# To verify another database or table, add an entry here.
DB_SCHEMA = {
    "COUNTERS_DB": {
        "$schema": "http://json-schema.org/draft-06/schema",
        "type": "object",
        "title": "Schema for COUNTERS DB's entities",
        # Tables that must be present in the dump.
        "required": ["COUNTERS_PORT_NAME_MAP"],
        "properties": {
            "COUNTERS_PORT_NAME_MAP": {
                "$id": "#/properties/COUNTERS_PORT_NAME_MAP",
                "type": "object",
            },
        },
    },
}
31+
32+
33+
def main():
    """Validate every database listed in DB_SCHEMA against its JSON schema.

    For each (db_name, schema) pair the database is dumped with
    ``sonic-db-dump`` into ``/tmp/<db_name>.json``, the dump is parsed as
    JSON, and the result is validated with ``jsonschema.validate``.

    Returns:
        0 if DB_SCHEMA is empty or every database passes validation;
        1 as soon as a dump command fails, a dump is not valid JSON, or a
        database does not satisfy its schema.

    Raises:
        OSError: if the dump file cannot be opened or ``sonic-db-dump``
            cannot be launched (handled by the script's top-level handler).
    """
    if not DB_SCHEMA:
        return 0

    for db_name, schema in DB_SCHEMA.items():
        db_dump_file = "/tmp/{}.json".format(db_name)

        # Dump the database currently being checked (not a fixed name), and
        # pass an argument list so no shell is involved; stdout goes
        # straight into the dump file.
        with open(db_dump_file, "w") as fp:
            proc = subprocess.run(["sonic-db-dump", "-n", db_name, "-y"],
                                  stdout=fp, stderr=subprocess.PIPE, text=True)
        if proc.returncode != 0:
            msg = "Failed to dump db {}. Return code: {} with err: {}".format(
                db_name, proc.returncode, proc.stderr)
            print(msg)
            syslog.syslog(syslog.LOG_ERR, msg)
            # A failed dump cannot be validated; report failure right away
            # instead of going on to parse a partial or empty file.
            return 1

        try:
            with open(db_dump_file) as fp:
                db_dump_data = json.load(fp)
        except ValueError as err:
            # This failure aborts the reboot, so log it as an error.
            syslog.syslog(syslog.LOG_ERR, "DB json file is not a valid json file. "
                          "Error: {}".format(str(err)))
            return 1

        # What: Validate if critical tables and entries are present in DB.
        # Why: This is needed to avoid warmbooting with a bad DB; which can
        #      potentially trigger failures in the reboot recovery path.
        # How: Validate DB against a schema which defines required tables.
        try:
            jsonschema.validate(instance=db_dump_data, schema=schema)
        except jsonschema.exceptions.ValidationError as err:
            syslog.syslog(syslog.LOG_ERR, "Database is missing tables/entries needed for reboot procedure. "
                          "DB integrity check failed with:\n{}".format(str(err.message)))
            return 1

    syslog.syslog(syslog.LOG_DEBUG, "Database integrity checks passed.")
    return 0
67+
68+
69+
# Script entry point: run the integrity check and exit with its status.
# Exit codes: the value returned by main(); 1 on SIGINT; 2 on any other
# unexpected exception.
if __name__ == '__main__':
    res = 0
    try:
        res = main()
    except KeyboardInterrupt:
        syslog.syslog(syslog.LOG_NOTICE, "SIGINT received. Quitting")
        res = 1
    except Exception as e:
        syslog.syslog(syslog.LOG_ERR, "Got an exception %s: Traceback: %s" % (str(e), traceback.format_exc()))
        res = 2
    finally:
        syslog.closelog()
        # The process ends through os._exit() below, which does not flush
        # stdio buffers. Flush explicitly so anything main() printed is not
        # lost when stdout/stderr is a pipe or a file.
        for stream in (sys.stdout, sys.stderr):
            try:
                stream.flush()
            except (OSError, ValueError):
                # Best effort: a closed or broken stream must not change
                # the exit status.
                pass
        try:
            sys.exit(res)
        except SystemExit:
            # sys.exit() always raises SystemExit, so the process is
            # terminated here with os._exit(res).
            os._exit(res)

scripts/fast-reboot

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ SHUTDOWN_ORDER_FILE="/etc/sonic/${REBOOT_TYPE}_order"
1212
VERBOSE=no
1313
FORCE=no
1414
IGNORE_ASIC=no
15+
IGNORE_DB_CHECK=no
1516
STRICT=no
1617
REBOOT_METHOD="/sbin/kexec -e"
1718
ASSISTANT_IP_LIST=""
@@ -38,6 +39,7 @@ EXIT_SYNCD_SHUTDOWN=11
3839
EXIT_FAST_REBOOT_DUMP_FAILURE=12
3940
EXIT_FILTER_FDB_ENTRIES_FAILURE=13
4041
EXIT_COUNTERPOLL_DELAY_FAILURE=14
42+
EXIT_DB_INTEGRITY_FAILURE=15
4143
EXIT_NO_CONTROL_PLANE_ASSISTANT=20
4244
EXIT_SONIC_INSTALLER_VERIFY_REBOOT=21
4345

@@ -59,8 +61,9 @@ function showHelpAndExit()
5961
echo "Usage: ${REBOOT_SCRIPT_NAME} [options]"
6062
echo " -h,-? : get this help"
6163
echo " -v : turn on verbose"
62-
echo " -f : force execution"
63-
echo " -i : ignore MD5-checksum-verification of ASIC configuration files"
64+
echo " -f : force execution - ignore Orchagent RESTARTCHECK failure"
65+
echo " -i : force execution - ignore ASIC MD5-checksum-verification"
66+
echo " -d : force execution - ignore database integrity check"
6467
echo " -r : reboot with /sbin/reboot"
6568
echo " -k : reboot with /sbin/kexec -e [default]"
6669
echo " -x : execute script with -x flag"
@@ -74,7 +77,7 @@ function showHelpAndExit()
7477

7578
function parseOptions()
7679
{
77-
while getopts "vfih?rkxc:s" opt; do
80+
while getopts "vfidh?rkxc:s" opt; do
7881
case ${opt} in
7982
h|\? )
8083
showHelpAndExit
@@ -88,6 +91,9 @@ function parseOptions()
8891
i )
8992
IGNORE_ASIC=yes
9093
;;
94+
d )
95+
IGNORE_DB_CHECK=yes
96+
;;
9197
r )
9298
REBOOT_METHOD="/sbin/reboot"
9399
;;
@@ -327,6 +333,23 @@ function check_docker_exec()
327333
done
328334
}
329335
336+
# Run the database integrity check before a warm-reboot or fastfast-reboot.
# Exits the script with EXIT_DB_INTEGRITY_FAILURE when the check fails,
# unless IGNORE_DB_CHECK is "yes" (the '-d' option).
function check_db_integrity()
{
    # Other reboot types skip the check.
    if [[ "$REBOOT_TYPE" != "warm-reboot" && "$REBOOT_TYPE" != "fastfast-reboot" ]]; then
        return 0
    fi

    # `|| VAR=$?` records a non-zero status instead of letting the failed
    # command stand on its own.
    CHECK_DB_INTEGRITY=0
    /usr/local/bin/check_db_integrity.py || CHECK_DB_INTEGRITY=$?
    if [[ ${CHECK_DB_INTEGRITY} -eq 0 ]]; then
        return 0
    fi

    if [[ x"${IGNORE_DB_CHECK}" == x"yes" ]]; then
        debug "Ignoring Database integrity checks..."
    else
        error "Failed to validate DB's integrity. Exit code: ${CHECK_DB_INTEGRITY}. Use '-d' option to force ignore this check."
        exit ${EXIT_DB_INTEGRITY_FAILURE}
    fi
}
352+
330353
function reboot_pre_check()
331354
{
332355
check_docker_exec
@@ -337,6 +360,8 @@ function reboot_pre_check()
337360
fi
338361
rm ${filename}
339362
363+
check_db_integrity
364+
340365
# Make sure /host has enough space for warm reboot temp files
341366
avail=$(df -k /host | tail -1 | awk '{ print $4 }')
342367
if [[ ${avail} -lt ${MIN_HD_SPACE_NEEDED} ]]; then

0 commit comments

Comments
 (0)