-
Notifications
You must be signed in to change notification settings - Fork 809
[fastboot] fastboot enhancement: Use warm-boot infrastructure for fast-boot #2286
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
5271e5a
83770a5
5154f42
e1cc628
b5f4629
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,8 +41,6 @@ EXIT_FILE_SYSTEM_FULL=3 | |
| EXIT_NEXT_IMAGE_NOT_EXISTS=4 | ||
| EXIT_ORCHAGENT_SHUTDOWN=10 | ||
| EXIT_SYNCD_SHUTDOWN=11 | ||
| EXIT_FAST_REBOOT_DUMP_FAILURE=12 | ||
| EXIT_FILTER_FDB_ENTRIES_FAILURE=13 | ||
| EXIT_COUNTERPOLL_DELAY_FAILURE=14 | ||
| EXIT_DB_INTEGRITY_FAILURE=15 | ||
| EXIT_NO_CONTROL_PLANE_ASSISTANT=20 | ||
|
|
@@ -130,32 +128,29 @@ function parseOptions() | |
| done | ||
| } | ||
|
|
||
| function common_clear() | ||
| function clear_boot() | ||
| { | ||
| # common_clear | ||
| debug "${REBOOT_TYPE} failure ($?) cleanup ..." | ||
|
|
||
| /sbin/kexec -u || /bin/true | ||
|
|
||
| teardown_control_plane_assistant | ||
| } | ||
|
|
||
| function clear_fast_boot() | ||
| { | ||
| common_clear | ||
|
|
||
| sonic-db-cli STATE_DB DEL "FAST_REBOOT|system" &>/dev/null || /bin/true | ||
| } | ||
|
|
||
| function clear_warm_boot() | ||
| { | ||
| common_clear | ||
| #clear_warm_boot | ||
| if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" || "$REBOOT_TYPE" = "fast-reboot" ]]; then | ||
| result=$(timeout 10s config warm_restart disable; res=$?; if [[ $res == 124 ]]; then echo timeout; else echo "code ($res)"; fi) || /bin/true | ||
| debug "Cancel warm-reboot: ${result}" | ||
|
|
||
| result=$(timeout 10s config warm_restart disable; res=$?; if [[ $res == 124 ]]; then echo timeout; else echo "code ($res)"; fi) || /bin/true | ||
| debug "Cancel warm-reboot: ${result}" | ||
| TIMESTAMP=$(date +%Y%m%d-%H%M%S) | ||
| if [[ -f ${WARM_DIR}/${REDIS_FILE} ]]; then | ||
| mv -f ${WARM_DIR}/${REDIS_FILE} ${WARM_DIR}/${REDIS_FILE}.${TIMESTAMP} || /bin/true | ||
| fi | ||
| fi | ||
|
|
||
| TIMESTAMP=$(date +%Y%m%d-%H%M%S) | ||
| if [[ -f ${WARM_DIR}/${REDIS_FILE} ]]; then | ||
| mv -f ${WARM_DIR}/${REDIS_FILE} ${WARM_DIR}/${REDIS_FILE}.${TIMESTAMP} || /bin/true | ||
| #clear_fast_boot | ||
| if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then | ||
| sonic-db-cli STATE_DB DEL "FAST_REBOOT|system" &>/dev/null || /bin/true | ||
| fi | ||
| } | ||
|
|
||
|
|
@@ -164,7 +159,7 @@ function init_warm_reboot_states() | |
| # If the current running instance was booted up with warm reboot. Then | ||
| # the current DB contents will likely mark warm reboot is done. | ||
| # Clear these states so that the next boot up image won't get confused. | ||
| if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then | ||
| if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" || "$REBOOT_TYPE" = "fast-reboot" ]]; then | ||
arfeigin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| sonic-db-cli STATE_DB eval " | ||
| for _, key in ipairs(redis.call('keys', 'WARM_RESTART_TABLE|*')) do | ||
| redis.call('hdel', key, 'state') | ||
|
|
@@ -271,7 +266,8 @@ function backup_database() | |
| and not string.match(k, 'FG_ROUTE_TABLE|') \ | ||
| and not string.match(k, 'WARM_RESTART_ENABLE_TABLE|') \ | ||
| and not string.match(k, 'VXLAN_TUNNEL_TABLE|') \ | ||
| and not string.match(k, 'BUFFER_MAX_PARAM_TABLE|') then | ||
| and not string.match(k, 'BUFFER_MAX_PARAM_TABLE|') \ | ||
| and not string.match(k, 'FAST_REBOOT|') then | ||
| redis.call('del', k) | ||
| end | ||
| end | ||
|
|
@@ -381,7 +377,7 @@ function check_docker_exec() | |
|
|
||
| function check_db_integrity() | ||
| { | ||
| if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then | ||
| if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" || "$REBOOT_TYPE" = "fast-reboot" ]]; then | ||
| CHECK_DB_INTEGRITY=0 | ||
| /usr/local/bin/check_db_integrity.py || CHECK_DB_INTEGRITY=$? | ||
| if [[ CHECK_DB_INTEGRITY -ne 0 ]]; then | ||
|
|
@@ -464,7 +460,6 @@ function unload_kernel() | |
| function save_counters_folder() { | ||
| if [[ "$REBOOT_TYPE" = "warm-reboot" ]]; then | ||
| debug "Saving counters folder before warmboot..." | ||
|
|
||
| counters_folder="/host/counters" | ||
| if [[ ! -d $counters_folder ]]; then | ||
| mkdir $counters_folder | ||
|
|
@@ -533,9 +528,11 @@ sonic_asic_type=$(sonic-cfggen -y /etc/sonic/sonic_version.yml -v asic_type) | |
| BOOT_TYPE_ARG="cold" | ||
| case "$REBOOT_TYPE" in | ||
| "fast-reboot") | ||
| check_warm_restart_in_progress | ||
arfeigin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| BOOT_TYPE_ARG=$REBOOT_TYPE | ||
| trap clear_fast_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM | ||
| trap clear_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM | ||
| sonic-db-cli STATE_DB SET "FAST_REBOOT|system" "1" "EX" "180" &>/dev/null | ||
| config warm_restart enable system | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For people who does not know the background, it is really confuse here, I would suggest to add comment here to explain why fastboot requires "enable warmboot".
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree, it confused me. |
||
| ;; | ||
| "warm-reboot") | ||
| check_warm_restart_in_progress | ||
|
|
@@ -548,7 +545,7 @@ case "$REBOOT_TYPE" in | |
| else | ||
| BOOT_TYPE_ARG="warm" | ||
| fi | ||
| trap clear_warm_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM | ||
| trap clear_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM | ||
| config warm_restart enable system | ||
| ;; | ||
| *) | ||
|
|
@@ -606,34 +603,11 @@ else | |
| load_kernel | ||
| fi | ||
|
|
||
| if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then | ||
| # Dump the ARP and FDB tables to files also as default routes for both IPv4 and IPv6 | ||
| # into /host/fast-reboot | ||
| DUMP_DIR=/host/fast-reboot | ||
| mkdir -p $DUMP_DIR | ||
| FAST_REBOOT_DUMP_RC=0 | ||
| /usr/local/bin/fast-reboot-dump.py -t $DUMP_DIR || FAST_REBOOT_DUMP_RC=$? | ||
| if [[ FAST_REBOOT_DUMP_RC -ne 0 ]]; then | ||
| error "Failed to run fast-reboot-dump.py. Exit code: $FAST_REBOOT_DUMP_RC" | ||
| unload_kernel | ||
| exit "${EXIT_FAST_REBOOT_DUMP_FAILURE}" | ||
| fi | ||
|
|
||
| FILTER_FDB_ENTRIES_RC=0 | ||
| # Filter FDB entries using MAC addresses from ARP table | ||
| /usr/local/bin/filter_fdb_entries -f $DUMP_DIR/fdb.json -a $DUMP_DIR/arp.json -c $CONFIG_DB_FILE || FILTER_FDB_ENTRIES_RC=$? | ||
vaibhavhd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if [[ FILTER_FDB_ENTRIES_RC -ne 0 ]]; then | ||
| error "Failed to filter FDb entries. Exit code: $FILTER_FDB_ENTRIES_RC" | ||
| unload_kernel | ||
| exit "${EXIT_FILTER_FDB_ENTRIES_FAILURE}" | ||
| fi | ||
| fi | ||
|
|
||
| init_warm_reboot_states | ||
|
|
||
| setup_control_plane_assistant | ||
|
|
||
| if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then | ||
| if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" || "$REBOOT_TYPE" = "fast-reboot" ]]; then | ||
arfeigin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| # Freeze orchagent for warm restart | ||
| # Ask orchagent_restart_check to try freeze 5 times with interval of 2 seconds, | ||
| # it is possible that the orchagent is in transient state and no opportunity to freeze | ||
|
|
@@ -659,6 +633,15 @@ if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then | |
| unload_kernel | ||
| exit "${EXIT_COUNTERPOLL_DELAY_FAILURE}" | ||
| fi | ||
|
|
||
| # Clear all routes except of default routes for faster reconciliation time. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since this is a delete call (cannot recover from here), shouldn't this step be executed after
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch! modified. will update also in #2365 (PR for 202205) |
||
| sonic-db-cli APPL_DB eval " | ||
| for _, k in ipairs(redis.call('keys', '*')) do | ||
| if string.match(k, 'ROUTE_TABLE:') and not string.match(k, 'ROUTE_TABLE:0.0.0.0/0') and not string.match(k, 'ROUTE_TABLE:::/0') then \ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How does this improve reconciliation time? Is that due to less routes to reconcile? Why do we need this for fast-reboot, if this is not done for warm-reboot? In fast-reboot case, does this mean we will have dataplane impact much sooner than the time when ASIC reset happens?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is the same as the previous implementation of "Fast-Reboot", we save only the default routes.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I might be missing something, can you please point me to the present logic for
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Saving of default routes was done by the fast-reboot-dump script other than that it was saving also ARP and FDB tables. It was being saved to /host/fast-reboot/default_routes.json
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The new suggested approach was tested side-by-side with the previous approach of dumping the arp/fdb/default_routes to json files and reloading them. This single change improves dataplane downtime by more than a second by itself.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reverted this change, will use the current implementation of using fast-reboot-dump and filter_fdb_entries scripts.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I inspected the last change made - reintroducing the use of fast-reboot-dump and filder_fdb_entries and the implications it had on fast-reboot performance and I wanted to rest you assured that it is not needed. The change of having fast-reboot over warm-reboot infrastructure introduces the call to backup_database which wasn't used previously for fast-reboot. This function backups FDB_TABLE and APP_DB is being dumped as well to dump.rdb so also media settings are being stored and alongside with ARPs. This is being restored in the change made in the sonic-buildimage PR which should be merged together. In this case the issue https://github.com/sonic-net/sonic-buildimage/issues/9165 won't reproduce. The main change that the relevant data will be restored in a similar way to as it is being restored in warm-reboot scenario and not using fdb.json; arp.json; default_routes.json; media_config.json files. If we will call backup_database function without deleting all routes except default routes will result in storing and then restoring all the routes after reset. This will affect dataplane downtime as the amount of routes increases and this is not a desired behavior. The specific implementation wasn't explicitly described in the HLD but the use of warm-reboot infrastructure i.e. use backup_database and dump.rdb is the cause for this change as the current logic of storing arp/fdb/default_routes and media settings is not changed.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't read in detail. but the explanation sound reasonable to me.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @tjchadaga can you please check this PR to ensure sonic-net/sonic-buildimage#9165 will not be seen again after this new design change gets merged?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @vaibhavhd - based on the design of backup and restore of app db using dump.rdb, i think media settings should be restored on time to avoid sonic-net/sonic-buildimage#9165 |
||
| redis.call('del', k) | ||
| end | ||
| end | ||
| " 0 > /dev/null | ||
| fi | ||
|
|
||
| # We are fully committed to reboot from this point on because critical | ||
|
|
@@ -726,18 +709,19 @@ for service in ${SERVICES_TO_STOP}; do | |
| if [[ "x$sonic_asic_type" == x"mellanox" ]]; then | ||
| check_issu_bank_file | ||
| fi | ||
| fi | ||
|
|
||
| # Warm reboot: dump state to host disk | ||
| if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then | ||
| sonic-db-cli ASIC_DB FLUSHDB > /dev/null | ||
| sonic-db-cli COUNTERS_DB FLUSHDB > /dev/null | ||
| sonic-db-cli FLEX_COUNTER_DB FLUSHDB > /dev/null | ||
| fi | ||
|
|
||
| # TODO: backup_database preserves FDB_TABLE | ||
| # need to cleanup as well for fastfast boot case | ||
| backup_database | ||
| if [[ "$REBOOT_TYPE" = "fastfast-reboot" || "$REBOOT_TYPE" = "fast-reboot" ]]; then | ||
| # Advanced reboot: dump state to host disk | ||
| sonic-db-cli ASIC_DB FLUSHDB > /dev/null | ||
| sonic-db-cli COUNTERS_DB FLUSHDB > /dev/null | ||
| sonic-db-cli FLEX_COUNTER_DB FLUSHDB > /dev/null | ||
| fi | ||
|
|
||
| # TODO: backup_database preserves FDB_TABLE | ||
| # need to cleanup as well for fastfast boot case | ||
| backup_database | ||
|
|
||
| fi | ||
| done | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.