-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Zebra fpm backpressure #16220
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Zebra fpm backpressure #16220
Changes from all commits
34670c4
3af381b
8926ac1
98b11de
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -483,10 +483,8 @@ struct zebra_dplane_provider { | |
| int (*dp_fini)(struct zebra_dplane_provider *prov, bool early_p); | ||
|
|
||
| _Atomic uint32_t dp_in_counter; | ||
| _Atomic uint32_t dp_in_queued; | ||
| _Atomic uint32_t dp_in_max; | ||
| _Atomic uint32_t dp_out_counter; | ||
| _Atomic uint32_t dp_out_queued; | ||
| _Atomic uint32_t dp_out_max; | ||
| _Atomic uint32_t dp_error_counter; | ||
|
|
||
|
|
@@ -6129,35 +6127,45 @@ int dplane_show_provs_helper(struct vty *vty, bool detailed) | |
| struct zebra_dplane_provider *prov; | ||
| uint64_t in, in_q, in_max, out, out_q, out_max; | ||
|
|
||
| vty_out(vty, "Zebra dataplane providers:\n"); | ||
|
|
||
| DPLANE_LOCK(); | ||
| prov = dplane_prov_list_first(&zdplane_info.dg_providers); | ||
| in = dplane_ctx_queue_count(&zdplane_info.dg_update_list); | ||
| DPLANE_UNLOCK(); | ||
|
|
||
| vty_out(vty, "dataplane Incoming Queue from Zebra: %" PRIu64 "\n", in); | ||
| vty_out(vty, "Zebra dataplane providers:\n"); | ||
|
|
||
| /* Show counters, useful info from each registered provider */ | ||
| while (prov) { | ||
| dplane_provider_lock(prov); | ||
| in_q = dplane_ctx_queue_count(&prov->dp_ctx_in_list); | ||
| out_q = dplane_ctx_queue_count(&prov->dp_ctx_out_list); | ||
| dplane_provider_unlock(prov); | ||
|
|
||
| in = atomic_load_explicit(&prov->dp_in_counter, | ||
| memory_order_relaxed); | ||
| in_q = atomic_load_explicit(&prov->dp_in_queued, | ||
| memory_order_relaxed); | ||
|
|
||
| in_max = atomic_load_explicit(&prov->dp_in_max, | ||
| memory_order_relaxed); | ||
| out = atomic_load_explicit(&prov->dp_out_counter, | ||
| memory_order_relaxed); | ||
| out_q = atomic_load_explicit(&prov->dp_out_queued, | ||
| memory_order_relaxed); | ||
|
|
||
| out_max = atomic_load_explicit(&prov->dp_out_max, | ||
| memory_order_relaxed); | ||
|
|
||
| vty_out(vty, "%s (%u): in: %"PRIu64", q: %"PRIu64", q_max: %"PRIu64", out: %"PRIu64", q: %"PRIu64", q_max: %"PRIu64"\n", | ||
| prov->dp_name, prov->dp_id, in, in_q, in_max, | ||
| out, out_q, out_max); | ||
| vty_out(vty, | ||
| " %s (%u): in: %" PRIu64 ", q: %" PRIu64 | ||
| ", q_max: %" PRIu64 ", out: %" PRIu64 ", q: %" PRIu64 | ||
| ", q_max: %" PRIu64 "\n", | ||
| prov->dp_name, prov->dp_id, in, in_q, in_max, out, | ||
| out_q, out_max); | ||
|
|
||
| prov = dplane_prov_list_next(&zdplane_info.dg_providers, prov); | ||
| } | ||
|
|
||
| out = zebra_rib_dplane_results_count(); | ||
mjstapp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| vty_out(vty, "dataplane Outgoing Queue to Zebra: %" PRIu64 "\n", out); | ||
|
|
||
| return CMD_SUCCESS; | ||
| } | ||
|
|
||
|
|
@@ -6299,10 +6307,6 @@ struct zebra_dplane_ctx *dplane_provider_dequeue_in_ctx( | |
| dplane_provider_lock(prov); | ||
|
|
||
| ctx = dplane_ctx_list_pop(&(prov->dp_ctx_in_list)); | ||
| if (ctx) { | ||
| atomic_fetch_sub_explicit(&prov->dp_in_queued, 1, | ||
| memory_order_relaxed); | ||
| } | ||
|
|
||
| dplane_provider_unlock(prov); | ||
|
|
||
|
|
@@ -6330,10 +6334,6 @@ int dplane_provider_dequeue_in_list(struct zebra_dplane_provider *prov, | |
| break; | ||
| } | ||
|
|
||
| if (ret > 0) | ||
| atomic_fetch_sub_explicit(&prov->dp_in_queued, ret, | ||
| memory_order_relaxed); | ||
|
|
||
| dplane_provider_unlock(prov); | ||
|
|
||
| return ret; | ||
|
|
@@ -6358,10 +6358,7 @@ void dplane_provider_enqueue_out_ctx(struct zebra_dplane_provider *prov, | |
| dplane_ctx_list_add_tail(&(prov->dp_ctx_out_list), ctx); | ||
|
|
||
| /* Maintain out-queue counters */ | ||
| atomic_fetch_add_explicit(&(prov->dp_out_queued), 1, | ||
| memory_order_relaxed); | ||
| curr = atomic_load_explicit(&prov->dp_out_queued, | ||
| memory_order_relaxed); | ||
| curr = dplane_ctx_queue_count(&prov->dp_ctx_out_list); | ||
| high = atomic_load_explicit(&prov->dp_out_max, | ||
| memory_order_relaxed); | ||
| if (curr > high) | ||
|
|
@@ -6383,9 +6380,6 @@ dplane_provider_dequeue_out_ctx(struct zebra_dplane_provider *prov) | |
| if (!ctx) | ||
| return NULL; | ||
|
|
||
| atomic_fetch_sub_explicit(&(prov->dp_out_queued), 1, | ||
| memory_order_relaxed); | ||
|
|
||
| return ctx; | ||
| } | ||
|
|
||
|
|
@@ -7331,10 +7325,10 @@ static void dplane_thread_loop(struct event *event) | |
| { | ||
| struct dplane_ctx_list_head work_list; | ||
| struct dplane_ctx_list_head error_list; | ||
| struct zebra_dplane_provider *prov; | ||
| struct zebra_dplane_provider *prov, *next_prov; | ||
| struct zebra_dplane_ctx *ctx; | ||
| int limit, counter, error_counter; | ||
| uint64_t curr, high; | ||
| uint64_t curr, out_curr, high; | ||
| bool reschedule = false; | ||
|
|
||
| /* Capture work limit per cycle */ | ||
|
|
@@ -7358,18 +7352,48 @@ static void dplane_thread_loop(struct event *event) | |
| /* Locate initial registered provider */ | ||
| prov = dplane_prov_list_first(&zdplane_info.dg_providers); | ||
|
|
||
| /* Move new work from incoming list to temp list */ | ||
| for (counter = 0; counter < limit; counter++) { | ||
| ctx = dplane_ctx_list_pop(&zdplane_info.dg_update_list); | ||
| if (ctx) { | ||
| ctx->zd_provider = prov->dp_id; | ||
| curr = dplane_ctx_queue_count(&prov->dp_ctx_in_list); | ||
| out_curr = dplane_ctx_queue_count(&prov->dp_ctx_out_list); | ||
|
|
||
| dplane_ctx_list_add_tail(&work_list, ctx); | ||
| } else { | ||
| break; | ||
| if (curr >= (uint64_t)limit) { | ||
| if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) | ||
| zlog_debug("%s: Current first provider(%s) Input queue is %" PRIu64 | ||
| ", holding off work", | ||
| __func__, prov->dp_name, curr); | ||
| counter = 0; | ||
| } else if (out_curr >= (uint64_t)limit) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can it be a situation where the input and output queue is hitting the limit?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah but it doesn't matter from my perspective. Both cases we do the same thing. It's sufficient to just state the first one encountered in a debug and move on. |
||
| if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) | ||
| zlog_debug("%s: Current first provider(%s) Output queue is %" PRIu64 | ||
| ", holding off work", | ||
| __func__, prov->dp_name, out_curr); | ||
| counter = 0; | ||
| } else { | ||
| int tlimit; | ||
| /* | ||
| * Let's limit the work to how what can be put on the | ||
| * in or out queue without going over | ||
| */ | ||
| tlimit = limit - MAX(curr, out_curr); | ||
| /* Move new work from incoming list to temp list */ | ||
| for (counter = 0; counter < tlimit; counter++) { | ||
| ctx = dplane_ctx_list_pop(&zdplane_info.dg_update_list); | ||
| if (ctx) { | ||
| ctx->zd_provider = prov->dp_id; | ||
|
|
||
| dplane_ctx_list_add_tail(&work_list, ctx); | ||
| } else { | ||
| break; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /* | ||
| * If there is anything still on the two input queues reschedule | ||
| */ | ||
| if (dplane_ctx_queue_count(&prov->dp_ctx_in_list) > 0 || | ||
| dplane_ctx_queue_count(&zdplane_info.dg_update_list) > 0) | ||
| reschedule = true; | ||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we drop this change?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. err. I intentionally added a newline to space things out a bit. So I am a bit confused |
||
| DPLANE_UNLOCK(); | ||
|
|
||
| atomic_fetch_sub_explicit(&zdplane_info.dg_routes_queued, counter, | ||
|
|
@@ -7388,8 +7412,9 @@ static void dplane_thread_loop(struct event *event) | |
| * items. | ||
| */ | ||
| if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) | ||
| zlog_debug("dplane enqueues %d new work to provider '%s'", | ||
| counter, dplane_provider_get_name(prov)); | ||
| zlog_debug("dplane enqueues %d new work to provider '%s' curr is %" PRIu64, | ||
| counter, dplane_provider_get_name(prov), | ||
| curr); | ||
|
|
||
| /* Capture current provider id in each context; check for | ||
| * error status. | ||
|
|
@@ -7422,10 +7447,7 @@ static void dplane_thread_loop(struct event *event) | |
|
|
||
| atomic_fetch_add_explicit(&prov->dp_in_counter, counter, | ||
| memory_order_relaxed); | ||
| atomic_fetch_add_explicit(&prov->dp_in_queued, counter, | ||
| memory_order_relaxed); | ||
| curr = atomic_load_explicit(&prov->dp_in_queued, | ||
| memory_order_relaxed); | ||
| curr = dplane_ctx_queue_count(&prov->dp_ctx_in_list); | ||
| high = atomic_load_explicit(&prov->dp_in_max, | ||
| memory_order_relaxed); | ||
| if (curr > high) | ||
|
|
@@ -7450,18 +7472,61 @@ static void dplane_thread_loop(struct event *event) | |
| if (!zdplane_info.dg_run) | ||
| break; | ||
|
|
||
| /* Locate next provider */ | ||
| next_prov = dplane_prov_list_next(&zdplane_info.dg_providers, | ||
| prov); | ||
| if (next_prov) { | ||
| curr = dplane_ctx_queue_count( | ||
| &next_prov->dp_ctx_in_list); | ||
| out_curr = dplane_ctx_queue_count( | ||
| &next_prov->dp_ctx_out_list); | ||
| } else | ||
| out_curr = curr = 0; | ||
|
|
||
| /* Dequeue completed work from the provider */ | ||
| dplane_provider_lock(prov); | ||
|
|
||
| while (counter < limit) { | ||
| ctx = dplane_provider_dequeue_out_ctx(prov); | ||
| if (ctx) { | ||
| dplane_ctx_list_add_tail(&work_list, ctx); | ||
| counter++; | ||
| } else | ||
| break; | ||
| if (curr >= (uint64_t)limit) { | ||
| if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) | ||
| zlog_debug("%s: Next Provider(%s) Input queue is %" PRIu64 | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMO, printing
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we are printing it? |
||
| ", holding off work", | ||
| __func__, next_prov->dp_name, curr); | ||
| counter = 0; | ||
| } else if (out_curr >= (uint64_t)limit) { | ||
| if (IS_ZEBRA_DEBUG_DPLANE_DETAIL) | ||
| zlog_debug("%s: Next Provider(%s) Output queue is %" PRIu64 | ||
| ", holding off work", | ||
| __func__, next_prov->dp_name, | ||
| out_curr); | ||
| counter = 0; | ||
| } else { | ||
| int tlimit; | ||
|
|
||
| /* | ||
| * Let's limit the work to how what can be put on the | ||
| * in or out queue without going over | ||
| */ | ||
| tlimit = limit - MAX(curr, out_curr); | ||
| while (counter < tlimit) { | ||
| ctx = dplane_provider_dequeue_out_ctx(prov); | ||
| if (ctx) { | ||
| dplane_ctx_list_add_tail(&work_list, | ||
| ctx); | ||
| counter++; | ||
| } else | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| /* | ||
| * Let's check if there are still any items on the | ||
| * input or output queus of the current provider | ||
| * if so then we know we need to reschedule. | ||
| */ | ||
| if (dplane_ctx_queue_count(&prov->dp_ctx_in_list) > 0 || | ||
| dplane_ctx_queue_count(&prov->dp_ctx_out_list) > 0) | ||
| reschedule = true; | ||
|
|
||
| dplane_provider_unlock(prov); | ||
|
|
||
| if (counter >= limit) | ||
|
|
@@ -7477,7 +7542,7 @@ static void dplane_thread_loop(struct event *event) | |
| } | ||
|
|
||
| /* Locate next provider */ | ||
| prov = dplane_prov_list_next(&zdplane_info.dg_providers, prov); | ||
| prov = next_prov; | ||
| } | ||
|
|
||
| /* | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.