diff --git a/.github/workflows/dash-bmv2-bldr-docker-acr.yml b/.github/workflows/dash-bmv2-bldr-docker-acr.yml index 228581639..609aea814 100644 --- a/.github/workflows/dash-bmv2-bldr-docker-acr.yml +++ b/.github/workflows/dash-bmv2-bldr-docker-acr.yml @@ -35,4 +35,4 @@ jobs: username: ${{ secrets.DASH_ACR_USERNAME }} password: ${{ secrets.DASH_ACR_PASSWORD }} - name: Publish dash-bmv2-bldr image - run: docker push sonicdash.azurecr.io/dash-bmv2-bldr:220630 \ No newline at end of file + run: make docker-publish-bmv2-bldr diff --git a/.github/workflows/dash-grpc1.43.2-docker-acr.yml b/.github/workflows/dash-grpc1.43.2-docker-acr.yml index 9ff9b0497..67a4e6c93 100644 --- a/.github/workflows/dash-grpc1.43.2-docker-acr.yml +++ b/.github/workflows/dash-grpc1.43.2-docker-acr.yml @@ -34,5 +34,5 @@ jobs: login-server: ${{ secrets.DASH_ACR_LOGIN_SERVER }} username: ${{ secrets.DASH_ACR_USERNAME }} password: ${{ secrets.DASH_ACR_PASSWORD }} - - name: Publish dash-bmv2 docker image - run: docker push sonicdash.azurecr.io/dash-grpc:1.43.2 + - name: Publish dash-grpc1.43.2 image + run: make docker-publish-dash-grpc diff --git a/.github/workflows/dash-md-spellcheck.yml b/.github/workflows/dash-md-spellcheck.yml index 5cc9bed4e..9b73265f8 100644 --- a/.github/workflows/dash-md-spellcheck.yml +++ b/.github/workflows/dash-md-spellcheck.yml @@ -3,9 +3,11 @@ on: pull_request: paths: - '**/*.md' + - '.wordlist.txt' push: paths: - '**/*.md' + - '.wordlist.txt' workflow_dispatch: jobs: diff --git a/.github/workflows/dash-p4c-bmv2-docker-acr.yml b/.github/workflows/dash-p4c-bmv2-docker-acr.yml index b5cfc9692..f89ed4874 100644 --- a/.github/workflows/dash-p4c-bmv2-docker-acr.yml +++ b/.github/workflows/dash-p4c-bmv2-docker-acr.yml @@ -35,4 +35,4 @@ jobs: username: ${{ secrets.DASH_ACR_USERNAME }} password: ${{ secrets.DASH_ACR_PASSWORD }} - name: Publish dash-p4c-bmv2 docker image to ACR - run: docker push sonicdash.azurecr.io/dash-p4c-bmv2:220819 + run: make docker-publish-dash-p4c diff --git a/.github/workflows/dash-saithrift-bldr-docker-acr.yml b/.github/workflows/dash-saithrift-bldr-docker-acr.yml index ccea91c19..d268b1d8f 100644 --- a/.github/workflows/dash-saithrift-bldr-docker-acr.yml +++ b/.github/workflows/dash-saithrift-bldr-docker-acr.yml @@ -41,4 +41,4 @@ jobs: - name: Build dash-saithrift-bldr image run: DOCKER_FLAGS=$docker_fg_flags make docker-saithrift-bldr - name: Publish dash-saithrift-bldr docker image - run: docker push sonicdash.azurecr.io/dash-saithrift-bldr:220719 + run: make docker-publish-saithrift-bldr diff --git a/.github/workflows/dash-saithrift-client-bldr-docker-acr.yml b/.github/workflows/dash-saithrift-client-bldr-docker-acr.yml index 2a5bda222..11c3b7bd9 100644 --- a/.github/workflows/dash-saithrift-client-bldr-docker-acr.yml +++ b/.github/workflows/dash-saithrift-client-bldr-docker-acr.yml @@ -51,4 +51,4 @@ jobs: - name: Build dash-saithrift-client-bldr image run: DOCKER_FLAGS=$docker_fg_flags make docker-saithrift-client-bldr - name: Publish dash-saithrift-client-bldr docker image - run: docker push sonicdash.azurecr.io/dash-saithrift-client-bldr:220723 + run: make docker-publish-saithrift-client-bldr diff --git a/.github/workflows/dash-saithrift-docker.yml b/.github/workflows/dash-saithrift-docker.yml deleted file mode 100644 index 774807431..000000000 --- a/.github/workflows/dash-saithrift-docker.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: DASH-docker-saithrift-bldr-build-image - -on: - push: - branches: [ "**" ] - paths: - - '.github/workflows/dash-saithrift-docker.yml' - - 
'dash-pipeline/dockerfiles/Dockerfile.saithrift-bldr' - - 'dash-pipeline/.dockerignore' - - 'dash-pipeline/dockerfiles/.dockerignore' - pull_request: - branches: [ "main" ] - paths: - - '.github/workflows/dash-saithrift-docker.yml' - - 'dash-pipeline/dockerfiles/Dockerfile.saithrift-bldr' - - 'dash-pipeline/.dockerignore' - - 'dash-pipeline/dockerfiles/.dockerignore' - workflow_dispatch: - -jobs: - build: - name: Build docker dash-saithrift-bldr image - runs-on: ubuntu-20.04 - defaults: - run: - working-directory: ./dash-pipeline - steps: - - uses: actions/checkout@v3 - - name: Build dash-saithrift docker image - run: make docker-saithrift-bldr - - name: Publish dash-saithrift docker image - run: make docker-publish-saithrift-bldr \ No newline at end of file diff --git a/.wordlist.txt b/.wordlist.txt index b1f41e5dc..b5aa7629d 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -5,11 +5,14 @@ Accton ACK Ack ack +acl ACL ACLs ACR adaptor adaptors +addr +ADDR agnostically amd apache @@ -46,30 +49,40 @@ bm BMV bmv BMv +bootup +Bootup br breakpoint Bringup bruh buildimage +BulkSyncDone BW bw bz callout cd centos +Cfg +cfg checkboxes chris +chrispsommers ci CLA cla CLI CloudStorm +CNIP codebase +compat +Compat conf CONFDB confgen config Containerlab +CP CreatedHalfOpenFlow CreatedOtherFlow CreatedTcpFlow @@ -90,6 +103,7 @@ CurrentUdpFlow customizable Cx cyberithub +dashsubmodule DASHOrch dashorch DashOrch @@ -98,6 +112,11 @@ DataCenter datagram Datagram datagrams +datapath +Datapath +dataplane +Dataplane +dataplanes datastore DBs DDoS @@ -110,6 +129,7 @@ DEST dest dev DHCP +dir Disaggregated Disaggregation distro @@ -123,12 +143,14 @@ Dockerfiles dockerfiles dockerhub Dockerhub +dockerized DoS DotNet downcasting DPDK DPU dpu +DPUControlMsgs DPUs drawio drilldown @@ -149,6 +171,7 @@ DroppedRedirectPackets DroppedResourcesMemory DroppedResourcesPacket DroppedResourcesUnifiedFlow +DSC DSCP dst DuringPortTimer @@ -199,6 +222,7 @@ gRPC grpc guid GW +HB HD HLD hld @@ -207,6 +231,7 @@ hoc HOSTIF HSL https +hw HW ICMP idempotency @@ -219,6 +244,7 @@ InbfromLB INIT Init initializer +integrations integrators interoperable io @@ -255,6 +281,7 @@ kvm lang libsai linux +liveness LLDP lldp loadbalancer @@ -263,8 +290,10 @@ loopback LPM lts Macsec +makefile Makefile Makefiles +makefiles MatchedHalfOpenFlow MatchedOtherFlow MatchedTcpFlow @@ -302,6 +331,7 @@ Novus NPUS NSG NSGs +num NumberOfFlowResimulated NVA NVidia @@ -318,6 +348,7 @@ OpenConfig openconfig opensource OpenTrafficGenerator +oper OPER orch OrchAgent @@ -336,6 +367,7 @@ params PAs PCI PCIe +Pensando performant pingmesh PIR @@ -355,6 +387,7 @@ Prem preprocessor preprogrammed prereq +Pritsak PrivateAddress programmability protobuf @@ -371,7 +404,9 @@ pytests Pyunit qcow QoS +qos Radv +rdpty reachability README READMEs @@ -384,6 +419,7 @@ repos resimulation responder Resttapi +retransmission retransmit retransmitted reviewable @@ -420,6 +456,7 @@ sdn Sflow SHA sharding +SInce SKU SKUs SLB @@ -443,6 +480,8 @@ SRC src STATEDB stateful +statemachine +struct subclassed subdirectories subdirectory @@ -457,10 +496,15 @@ subtype supportability SUT svg +sw +SwitchoverDone +switchovers SWSS swss SynAck syncd +synched +synching TBD tbd TCP @@ -519,9 +563,11 @@ unpair Unpair untracked upcasting +upstreaming vcpus veth VFP +vip virsh virt virtio @@ -554,6 +600,7 @@ vPORT VPorts VTEP VTEPs +VxLAN VXLAN vxlan warmboots diff --git a/dash-pipeline/Makefile b/dash-pipeline/Makefile index a50723068..c5d2ed870 100644 --- a/dash-pipeline/Makefile +++ 
b/dash-pipeline/Makefile
@@ -296,6 +296,12 @@ docker-bmv2-bldr:
 docker-pull-bmv2-bldr:
	docker pull $(DOCKER_BMV2_BLDR_IMG)
+
+
+docker-publish-bmv2-bldr:
+	@echo "Publish $(DOCKER_BMV2_BLDR_IMG) - requires credentials, can only do from DASH repo, not a fork"
+	docker push $(DOCKER_BMV2_BLDR_IMG)
+
 ###############################
 docker-saithrift-bldr:
	docker build \
@@ -311,6 +317,10 @@ docker-saithrift-bldr:
 docker-pull-saithrift-bldr:
	docker pull $(DOCKER_SAITHRIFT_BLDR_IMG)
 
+docker-publish-saithrift-bldr:
+	@echo "Publish $(DOCKER_SAITHRIFT_BLDR_IMG) - requires credentials, can only do from DASH repo, not a fork"
+	docker push $(DOCKER_SAITHRIFT_BLDR_IMG)
+
 ###############################
 # Builder, has base packages to make client docker
 docker-saithrift-client-bldr:
@@ -327,6 +337,10 @@ docker-saithrift-client-bldr:
 docker-pull-saithrift-client-bldr:
	docker pull $(DOCKER_SAITHRIFT_CLIENT_BLDR_IMG)
 
+docker-publish-saithrift-client-bldr:
+	@echo "Publish $(DOCKER_SAITHRIFT_CLIENT_BLDR_IMG) - requires credentials, can only do from DASH repo, not a fork"
+	docker push $(DOCKER_SAITHRIFT_CLIENT_BLDR_IMG)
+
 ###############################
 # Client image, rebuild any time SAI interface changes
@@ -400,6 +414,10 @@ docker-dash-p4c:
 docker-pull-dash-p4c:
	docker pull $(DOCKER_P4C_BMV2_IMG)
 
+docker-publish-dash-p4c:
+	@echo "Publish $(DOCKER_P4C_BMV2_IMG) - requires credentials, can only do from DASH repo, not a fork"
+	docker push $(DOCKER_P4C_BMV2_IMG)
+
 ###############################
 
 docker-dash-grpc:
@@ -416,6 +434,10 @@ docker-dash-grpc:
 docker-pull-dash-grpc:
	docker pull $(DOCKER_GRPC_IMG)
 
+docker-publish-dash-grpc:
+	@echo "Publish $(DOCKER_GRPC_IMG) - requires credentials, can only do from DASH repo, not a fork"
+	docker push $(DOCKER_GRPC_IMG)
+
 ###############################
 # BMV2-PKTGEN NETWORKING TARGETS
 ###############################
diff --git a/dash-pipeline/Makefile.3rdpty b/dash-pipeline/Makefile.3rdpty
new file mode 100644
index 000000000..684f75572
--- /dev/null
+++ b/dash-pipeline/Makefile.3rdpty
@@ -0,0 +1,66 @@
+all: dash-p4 sai thirdparty-saithrift-server docker-saithrift-client
+
+clean: dash-p4-clean dash-sai-clean
+
+# Submodule location relative to this Makefile
+DASHDIR ?= DASH
+
+DASH: dash-submodule
+
+.PHONY: dash-submodule
+dash-submodule:
+	@echo "Initializing DASH submodule..."
+	git submodule update --init DASH
+
+# Build entire dash-pipeline codebase as sanity check
+dash-pipeline-regression: DASH
+	$(MAKE) -C $(DASHDIR)/dash-pipeline clean
+	$(MAKE) -C $(DASHDIR)/dash-pipeline all
+
+dash-pipeline-clean: DASH
+	$(MAKE) -C $(DASHDIR)/dash-pipeline clean
+
+.PHONY: sai
+sai: dash-sai-clean dash-sai-headers dash-sai-meta thirdparty-libsai
+
+# Build behavioral model code, needed for SAI headers
+dash-p4:
+	$(MAKE) -C $(DASHDIR)/dash-pipeline p4
+
+dash-p4-clean:
+	$(MAKE) -C $(DASHDIR)/dash-pipeline p4-clean
+
+dash-sai-clean:
+	$(MAKE) -C $(DASHDIR)/dash-pipeline sai-clean
+
+# Autogenerate SAI headers
+dash-sai-headers:
+	$(MAKE) -C $(DASHDIR)/dash-pipeline sai-headers
+
+# Autogenerate SAI meta, needed for saithrift client/server
+dash-sai-meta:
+	$(MAKE) -C $(DASHDIR)/dash-pipeline sai-meta
+
+# Implementation-dependent libsai library
+thirdparty-libsai:
+	@echo "Build third-party libsai"
+	@echo "  Put libsai.so under $(DASHDIR)/dash-pipeline/SAI/lib"
+	@echo "  For use by saithrift-server build stage."
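+# Illustrative sketch only: a vendor might replace the echo recipe above with
+# something like the following ("vendor-sai-adaptor" is a hypothetical
+# directory and is not part of DASH):
+#
+#  thirdparty-libsai:
+#  	$(MAKE) -C vendor-sai-adaptor libsai.so
+#  	cp vendor-sai-adaptor/libsai.so $(DASHDIR)/dash-pipeline/SAI/lib/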
+
+thirdparty-saithrift-server:
+	@echo "Build third-party saithrift-server"
+	@echo "  Expects libsai.so under $(DASHDIR)/dash-pipeline/SAI/lib"
+	# For reference:
+	# $(MAKE) -C $(DASHDIR)/dash-pipeline saithrift-server
+
+docker-saithrift-client:
+	@echo "Build third-party saithrift-client"
+	@echo "  Expects saithrift-server already built"
+	# Uncomment when saithrift server can be built
+	# $(MAKE) -C $(DASHDIR)/dash-pipeline docker-saithrift-client
+
+run-all-tests:
+	# Uncomment when saithrift client & server can be built
+	# Can add more custom tests in addition to DASH tests
+	# make -C $(DASHDIR)/dash-pipeline run-all-tests
diff --git a/dash-pipeline/README-dash-as-submodule.md b/dash-pipeline/README-dash-as-submodule.md
new file mode 100644
index 000000000..291906634
--- /dev/null
+++ b/dash-pipeline/README-dash-as-submodule.md
@@ -0,0 +1,146 @@
+**I want a [Quick-Start](#quick-start)!**
+
+**Table of Contents**
+
+- [Importing the DASH project into another project](#importing-the-dash-project-into-another-project)
+- [Quick-Start](#quick-start)
+- [How to use DASH as a Git Submodule](#how-to-use-dash-as-a-git-submodule)
+- [Third-Party Workflow & DASH Workflow Reuse](#third-party-workflow--dash-workflow-reuse)
+  - [Recap: DASH bmv2 workflow](#recap-dash-bmv2-workflow)
+  - [Custom DASH Workflow](#custom-dash-workflow)
+  - [Reusable Build toolchain and artifacts](#reusable-build-toolchain-and-artifacts)
+  - [Required Custom Tools and Artifacts](#required-custom-tools-and-artifacts)
+  - [Custom Traffic Test Harness](#custom-traffic-test-harness)
+  - [Custom Tests](#custom-tests)
+  - [Third-Party CI Pipeline Automation (GitHub Actions)](#third-party-ci-pipeline-automation-github-actions)
+
+# Importing the DASH project into another project
+
+The [Azure/DASH project](https://github.com/Azure/DASH) can be used as a resource within other projects, such as third-party commercial or open-source DASH implementations. For example, a commercial DPU vendor can incorporate the DASH project into a private Git repository and utilize many of its components, providing consistency with the community implementation and definition, reusing test cases, and avoiding duplication of effort.
+
+# Quick-Start
+This shows how to import the [Azure/DASH project](https://github.com/Azure/DASH) into your own Git project.
+
+A minimal sample project created using this recipe can be found here: https://github.com/chrispsommers/dashsubmodule
+
+1. Start with a Git project, either a new or existing one of your choosing. You might want to make a scratch project just to try this out.
+2. Copy the [Makefile.3rdpty](Makefile.3rdpty) into your project. You can put it into its own subdirectory and/or rename it to suit. If you rename it, please interpret the subsequent instructions accordingly.
+3. Choose a subdirectory in which to import the DASH project as a submodule. The sample [Makefile.3rdpty](Makefile.3rdpty) assumes a directory `./DASH` relative to the Makefile location. Edit the following line in the makefile to change this (or set the environment variable `DASHDIR` before calling `make`):
+   ```
+   DASHDIR ?= DASH
+   ```
+4. Import the DASH repository as a submodule using the following command.
Modify the final parameter to match the relative directory in your project into which you want the submodule cloned:
+   ```
+   git submodule add -b main --name DASH git@github.com:Azure/DASH.git DASH
+   ```
+5. Commit the changes now or later; see [How to use DASH as a Git Submodule](#how-to-use-dash-as-a-git-submodule).
+6. To verify that the DASH submodule was imported correctly and crucial steps function properly, execute the following. (If you rename the file to `Makefile` you can omit the `-f` option.)
+   ```
+   make [-f Makefile.3rdpty] clean
+   make [-f Makefile.3rdpty] all
+   ```
+   This will run selected build steps from DASH - those which don't depend upon third-party implementations. This includes compiling the P4 code and generating SAI headers, and it also pulls in several Docker images. It's a great starting point. See the detailed descriptions elsewhere in this document for next steps.
+7. *OPTIONAL:* To perform the entire dash-pipeline build process, execute the following:
+   ```
+   make [-f Makefile.3rdpty] dash-pipeline-regression
+   ```
+   This will run `make clean && make all` from the DASH project. You don't *have* to do this since many of the artifacts are irrelevant for third-party adaptations.
+8. *OPTIONAL:* You can also `cd DASH/dash-pipeline` and run any of the steps outlined in the DASH bmv2 [workflows](README-dash-workflows.md), such as the following. This has the benefit of verifying the function of SW traffic generators etc. in your environment. You can use this to confirm functional tests against the reference implementation.
+   ```
+   make run-switch            # console 1
+   make run-saithrift-server  # console 2
+   make run-all-tests         # console 3
+   ```
+
+# How to use DASH as a Git Submodule
+A third-party project can import the DASH project as a Git Submodule. See [about-git-submodules](README-dash-workflows.md#about-git-submodules) for background. In this example, DASH is imported at the top level of the project using the following command. (See the documentation for `git submodule add` for other options.)
+
+```
+git submodule add -b main --name DASH git@github.com:Azure/DASH.git DASH
+```
+
+The effects of this command are:
+- Clone the DASH repository in place under the directory `DASH` (relative to the working directory of the command).
+- Make an entry in the `.gitmodules` file (creating it if needed). For example:
+  ```
+  [submodule "DASH"]
+  	path = DASH
+  	url = git@github.com:Azure/DASH.git
+  	branch = main
+  ```
+- Store the imported repository's git index/database under the parent project's `.git/modules` directory.
+
+Importing the submodule also creates new items for `DASH` and `.gitmodules` which need to be committed. For example:
+```
+chris@chris-z4:~/dashsubmodule$ git status
+On branch main
+Your branch is up to date with 'origin/main'.
+
+Changes to be committed:
+  (use "git restore --staged <file>..." to unstage)
+	new file:   .gitmodules
+	new file:   DASH
+
+Changes not staged for commit:
+  (use "git add <file>..." to update what will be committed)
+  (use "git restore <file>..." to discard changes in working directory)
+  (commit or discard the untracked or modified content in submodules)
+	modified:   DASH (modified content, untracked content)
+```
+To commit to your project:
+```
+git add .gitmodules DASH
+git commit
+[git push]
+```
+The resulting Git structure is as follows. DASH is imported as a submodule.
Furthermore, the DASH project itself contains multiple levels of submodules; via `git submodule update --init` instructions in `DASH/dash-pipeline/Makefile`, these repositories are cloned in place.
+
+![Git Hierarchy](images/dash-submodule-git-hierarchy.svg)
+
+# Third-Party Workflow & DASH Workflow Reuse
+## Recap: DASH bmv2 workflow
+The figure below shows the traditional bmv2-based workflow, which is described in [README-dash-workflows](README-dash-workflows.md).
+
+![dash-p4-bmv2-thrift-workflow](https://github.com/Azure/DASH/raw/main/dash-pipeline/images/dash-p4-bmv2-thrift-workflow.svg)
+
+## Custom DASH Workflow
+The reference project contains a `Makefile.3rdpty` to serve as a starting point. It has make targets which are just wrappers that invoke predefined Makefile targets in the DASH repository (e.g. using `make -C...`). It also has placeholder make targets where third-party customization is required. You can modify it as needed; the intent is to reuse as much as possible from DASH.
+
+The drawing below shows where third-party customization will be needed, using "exciting" colors.
+
+![Custom Dash Workflow](images/dash-submodule-workflow.svg)
+
+The main objective is to reuse DASH artifacts, Makefiles, Dockerfiles, etc. where possible and replace (or augment) certain resources with third-party implementations.
+
+## Reusable Build toolchain and artifacts
+
+The following toolchains and output artifacts *should* be reusable as-is from the DASH project, with no (or very few) modifications:
+* dash-pipeline P4 source code (for SAI header generation)
+* P4 behavioral model code compilation. The primary artifact of interest is just the P4Info file used to auto-generate the SAI headers for overlay services. Both the dockerized `dash-p4c-bmv2` container and the output artifacts should be reusable as-is from DASH.
+* SAI experimental headers describing the interface to the dataplane, derived from P4Info. A code generator script [SAI/sai_api_gen.py](SAI/sai_api_gen.py) produces SAI headers derived from the P4 code, emitted into [SAI/SAI/experimental](SAI/SAI/experimental). It also generates a SAI-to-P4Runtime adaptor layer for the bmv2 implementation, emitted into [SAI/lib](SAI/lib). Third-party workflows can ignore the bmv2 adaptor layer or use it as inspiration.
+* SAI metadata derived from the combination of standard SAI headers and the DASH headers. This is done by makefiles and scripts inside the SAI submodule and also uses the `dash-saithrift-bldr` container.
+* `saithrift-client-bldr` base docker container, retrieved from a docker registry and built in a standard way. It contains base tools and packages.
+* `dash-saithrift-client` docker container which includes all tools and artifacts needed to perform dataplane tests. The artifacts are generated based on the outputs of the saithrift-server build step (below), which might need third-party customization as described below.
+  >**Note:** The community dash-pipeline bmv2 build workflows assume that saithrift-server is built first and saithrift-client is built next. In principle the saithrift client is target-agnostic and should not depend upon the saithrift server, but the build processes for the saithrift client and server are somewhat intertwined. To make a saithrift client for a third-party implementation *without* depending upon a third-party saithrift server (which depends on third-party `libsai`), just run the dash-pipeline `make all` target and use the resulting saithrift-client docker image.
+
+## Required Custom Tools and Artifacts
+The following will undoubtedly be developed uniquely for each DASH implementation:
+* DASH dataplane - this is the primary focus of third-party DASH implementations and can be any mix of hardware and/or software.
+* SAI adaptor layer to translate SAI API calls into the underlying dataplane configuration "SDK." You might want to adapt the code generator [SAI/sai_api_gen.py](SAI/sai_api_gen.py) to produce your own adaptor layer, or a skeleton thereof.
+* saithrift-server [Makefile](SAI/saithrift/Makefile) and `dash-saithrift-bldr` container to run the saithrift code generator and link to third-party `libsai.so`.
+
+  >**Note:** this might require significant third-party customizations to compile for certain architectures. The default is Ubuntu 20.04 running on an x86 "device." Some implementations, e.g. the bmv2/P4Runtime implementation, use an RPC between the SAI adaptor layer and the underlying device "SDK," which means the saithrift server *could* run on one processor while the dataplane and device SDK run in a DPU with a different architecture. For example, if a third-party dataplane has a native RPC such as gRPC, it could serve the same role as the P4Runtime API in the community bmv2 architecture. If so, then the saithrift server could be compiled using the community workflow; presumably the custom `libsai` would translate SAI calls into native third-party gRPC calls, which means the saithrift server runs in a different process than the dataplane/native gRPC server. The [SAI/saithrift/Makefile](SAI/saithrift/Makefile) will probably need modifications to pass in different `SAIRPC_EXTRA_LIBS` at a minimum.
+
+## Custom Traffic Test Harness
+The community DASH bmv2 test workflow includes SW traffic generators connected to the SW dataplane via `veth` ports. Third-party integrations can continue to use this method, or others, including:
+* Physical NIC devices driven by SW traffic generators, cabled to SW dataplanes bound to other physical NIC ports (high-performance SW implementations relying on NIC devices)
+* HW implementations of DASH dataplanes, e.g. "real DPUs" or physical emulations thereof, cabled to SW traffic generators which are bound to physical NIC ports.
+* HW or SW DASH dataplane implementations cabled to HW-based traffic generators such as IXIA chassis etc.
+
+If test ports other than `veth0/1` and `veth2/3` are used, some modifications of setup scripts may be required:
+* PTF tests using scapy for SW traffic generation can be parameterized to specify logical-to-physical port mappings.
+* Pytests using the ixia-c SW traffic generator are set up using docker-compose topology files under [DASH/test/third-party/traffic_gen/deployment](https://github.com/Azure/DASH/tree/main/test/third-party/traffic_gen/deployment)
+
+## Custom Tests
+You can use the tests under DASH by calling the appropriate DASH make targets from the parent project. You can also have private tests in your own project repository which you invoke from your own Makefiles. If you write new tests which are generally applicable, we recommend that you consider upstreaming them to the community repository.
+
+## Third-Party CI Pipeline Automation (GitHub Actions)
+You should be able to adapt the CI automation files from the DASH project, located under [.github/workflows](../.github/workflows). You will need to modify them to suit your project by changing the trigger conditions (e.g. file system paths) and steps.
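+
+As a minimal sketch (the workflow name, trigger paths, and `my-dataplane` directory below are hypothetical placeholders), an adapted workflow might look like:
+
+```yaml
+name: my-dash-ci
+on:
+  push:
+    paths:
+      - 'my-dataplane/**'
+      - 'Makefile.3rdpty'
+jobs:
+  build:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: recursive   # pull in the DASH submodule
+      - name: Build P4 code and SAI headers via the DASH submodule
+        run: make -f Makefile.3rdpty sai
+```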
\ No newline at end of file diff --git a/dash-pipeline/SAI/sai_api_gen.py b/dash-pipeline/SAI/sai_api_gen.py index 3c77f27ca..062592963 100755 --- a/dash-pipeline/SAI/sai_api_gen.py +++ b/dash-pipeline/SAI/sai_api_gen.py @@ -271,7 +271,7 @@ def write_sai_impl_files(sai_api): with open('./lib/sai' + sai_api['app_name'].replace('_', '') + '.cpp', 'w') as o: o.write(sai_impl_str) -def write_sai_makefile(sai_api_name_list): +def write_sai_makefile(sai_api_name_list, sai_api_full_name_list): env = Environment(loader=FileSystemLoader('.')) makefile_tm = env.get_template('/templates/Makefile.j2') makefile_str = makefile_tm.render(api_names = sai_api_name_list) @@ -281,7 +281,7 @@ def write_sai_makefile(sai_api_name_list): env = Environment(loader=FileSystemLoader('.'), trim_blocks=True, lstrip_blocks=True) sai_impl_tm = env.get_template('/templates/utils.cpp.j2') - sai_impl_str = sai_impl_tm.render(tables = sai_api[TABLES_TAG], app_name = sai_api['app_name']) + sai_impl_str = sai_impl_tm.render(tables = sai_api[TABLES_TAG], app_name = sai_api['app_name'], api_names = sai_api_full_name_list) with open('./lib/utils.cpp', 'w') as o: o.write(sai_impl_str) @@ -393,6 +393,7 @@ def write_sai_files(sai_api): sai_apis, all_table_names = generate_sai_apis(json_program, args.ignore_tables.split(',')) sai_api_name_list = [] +sai_api_full_name_list = [] for sai_api in sai_apis: # Update object name reference for action params for table in sai_api[TABLES_TAG]: @@ -415,8 +416,9 @@ def write_sai_files(sai_api): write_sai_files(sai_api) write_sai_impl_files(sai_api) sai_api_name_list.append(sai_api['app_name'].replace('_', '')) + sai_api_full_name_list.append(sai_api['app_name']) -write_sai_makefile(sai_api_name_list) +write_sai_makefile(sai_api_name_list, sai_api_full_name_list) if args.print_sai_lib: print(json.dumps(sai_api, indent=2)) diff --git a/dash-pipeline/SAI/templates/saiapi.h.j2 b/dash-pipeline/SAI/templates/saiapi.h.j2 index 6506f68ae..c918cea7d 100644 --- a/dash-pipeline/SAI/templates/saiapi.h.j2 +++ b/dash-pipeline/SAI/templates/saiapi.h.j2 @@ -34,7 +34,7 @@ */ {% for table in sai_api.tables %} -{% if table.actions | length > 1 %} +{% if table.actions | length > 1 or ((table.actions | length == 1) and (((table.is_object == 'false') or (table['keys'] | length <= 1)) and ((table['actionParams'] | length == 0)))) %} /** * @brief Attribute data for #SAI_{{ table.name | upper }}_ATTR_ACTION */ @@ -113,7 +113,7 @@ typedef enum _sai_{{ table.name }}_attr_t SAI_{{ table.name | upper }}_ATTR_START, {% set ns = namespace(firstattr=false) %} -{% if table.actions | length > 1 %} +{% if table.actions | length > 1 or ((table.actions | length == 1) and (((table.is_object == 'false') or (table['keys'] | length <= 1)) and ((table['actionParams'] | length == 0)))) %} /** * @brief Action * diff --git a/dash-pipeline/SAI/templates/utils.cpp.j2 b/dash-pipeline/SAI/templates/utils.cpp.j2 index debb29d96..683182933 100644 --- a/dash-pipeline/SAI/templates/utils.cpp.j2 +++ b/dash-pipeline/SAI/templates/utils.cpp.j2 @@ -244,13 +244,11 @@ sai_switch_api_t sai_switch_api_impl = { }; -/* TODO [cs] This should be auto-generated or part of per-API, auto-generated include file */ -extern sai_dash_api_t sai_dash_api_impl; -extern sai_dash_vnet_api_t sai_dash_vnet_api_impl; -extern sai_dash_acl_api_t sai_dash_acl_api_impl; +{% for api in api_names %} +extern sai_{{ api }}_api_t sai_{{ api }}_api_impl; +{% endfor %} -/* TODO [cs] This should be auto-generated */ sai_status_t sai_api_query( _In_ sai_api_t api, _Out_ void 
**api_method_table) { @@ -260,18 +258,12 @@ sai_status_t sai_api_query( *api_method_table = (void *)&sai_switch_api_impl; break; - case SAI_API_DASH: - *api_method_table = (void *)&sai_dash_api_impl; +{% for api in api_names %} + case SAI_API_{{ api | upper }}: + *api_method_table = (void *)&sai_{{ api }}_api_impl; break; - case SAI_API_DASH_ACL: - *api_method_table = (void *)&sai_dash_acl_api_impl; - break; - - case SAI_API_DASH_VNET: - *api_method_table = (void *)&sai_dash_vnet_api_impl; - break; - +{% endfor %} default: return SAI_STATUS_NOT_SUPPORTED; diff --git a/dash-pipeline/bmv2/dash_outbound.p4 b/dash-pipeline/bmv2/dash_outbound.p4 index 59e7c0292..8bec5a5f0 100644 --- a/dash-pipeline/bmv2/dash_outbound.p4 +++ b/dash-pipeline/bmv2/dash_outbound.p4 @@ -31,7 +31,7 @@ control outbound(inout headers_t hdr, direct_counter(CounterType.packets_and_bytes) routing_counter; - @name("outbound_routing|dash_vnet") + @name("outbound_routing|dash_outbound_routing") table routing { key = { meta.eni_id : exact @name("meta.eni_id:eni_id"); @@ -61,7 +61,7 @@ control outbound(inout headers_t hdr, direct_counter(CounterType.packets_and_bytes) ca_to_pa_counter; - @name("outbound_ca_to_pa|dash_vnet") + @name("outbound_ca_to_pa|dash_outbound_ca_to_pa") table ca_to_pa { key = { /* Flow for express route */ diff --git a/dash-pipeline/bmv2/dash_pipeline.p4 b/dash-pipeline/bmv2/dash_pipeline.p4 index 2bb754182..f138c6c8b 100644 --- a/dash-pipeline/bmv2/dash_pipeline.p4 +++ b/dash-pipeline/bmv2/dash_pipeline.p4 @@ -35,7 +35,7 @@ control dash_ingress(inout headers_t hdr, action accept() { } - @name("vip|dash") + @name("vip|dash_vip") table vip { key = { hdr.ipv4.dst_addr : exact @name("hdr.ipv4.dst_addr:VIP"); @@ -43,7 +43,7 @@ control dash_ingress(inout headers_t hdr, actions = { accept; - deny; + @defaultonly deny; } const default_action = deny; @@ -53,7 +53,7 @@ control dash_ingress(inout headers_t hdr, meta.direction = direction_t.OUTBOUND; } - @name("direction_lookup|dash") + @name("direction_lookup|dash_direction_lookup") table direction_lookup { key = { hdr.vxlan.vni : exact @name("hdr.vxlan.vni:VNI"); @@ -132,7 +132,7 @@ control dash_ingress(inout headers_t hdr, } } - @name("eni|dash") + @name("eni|dash_eni") table eni { key = { meta.eni_id : exact @name("meta.eni_id:eni_id"); @@ -166,7 +166,7 @@ control dash_ingress(inout headers_t hdr, meta.vnet_id = src_vnet_id; } - @name("pa_validation|dash_vnet") + @name("pa_validation|dash_pa_validation") table pa_validation { key = { meta.vnet_id: exact @name("meta.vnet_id:vnet_id"); @@ -175,13 +175,13 @@ control dash_ingress(inout headers_t hdr, actions = { permit; - deny; + @defaultonly deny; } const default_action = deny; } - @name("inbound_routing|dash_vnet") + @name("inbound_routing|dash_inbound_routing") table inbound_routing { key = { meta.eni_id: exact @name("meta.eni_id:eni_id"); @@ -201,7 +201,7 @@ control dash_ingress(inout headers_t hdr, meta.eni_id = eni_id; } - @name("eni_ether_address_map|dash") + @name("eni_ether_address_map|dash_eni") table eni_ether_address_map { key = { meta.eni_addr : exact @name("meta.eni_addr:address"); diff --git a/dash-pipeline/images/dash-submodule-git-hierarchy.svg b/dash-pipeline/images/dash-submodule-git-hierarchy.svg new file mode 100644 index 000000000..8c3716e11 --- /dev/null +++ b/dash-pipeline/images/dash-submodule-git-hierarchy.svg @@ -0,0 +1,4 @@ + + + +
+[drawio SVG text layer; the SVG markup is not reproduced here. The diagram shows the Git hierarchy: the parent project repo (github.com/parent-project) directly imports github.com/Azure/DASH as a submodule at the filesystem directory /DASH, and the DASH project itself indirectly imports github.com/opencomputeproject/SAI at ./dash-pipeline/SAI/SAI and github.com/p4lang/ptf at ./test/ptf as nested Git submodules.]
\ No newline at end of file diff --git a/dash-pipeline/images/dash-submodule-workflow.svg b/dash-pipeline/images/dash-submodule-workflow.svg new file mode 100644 index 000000000..96de7a3f6 --- /dev/null +++ b/dash-pipeline/images/dash-submodule-workflow.svg @@ -0,0 +1,4 @@ + + + +
+[drawio SVG text layer; the SVG markup is not reproduced here. The diagram shows the custom third-party DASH workflow: the DASH P4 behavioral model (source of truth) in DASH/dash-pipeline (Git submodule) is compiled by p4c (make p4) into dash_pipeline.json (P4 Info); generate_dash_api.sh generates the DASH SAI headers (overlay) alongside the standard OCP SAI header file subset (underlay) from opencompute/SAI; the saithrift code generator (meta/make, meta/gensairpc.pl) emits the thrift server skeleton C++ code, which is linked with libsai from a custom third-party SAI implementation ("SAI Implementation ?"); containers provide the build and run environment (make docker-XXX, make docker-XXX-pull explicit, docker-run implicit, make docker-XXX-publish); run-time pieces - the saithrift server (make run-switch, make run-saithrift_XXXtests, make run-saithrift_dev-XXXtests), the Python thrift client lib, test scripts (PTF, Pytest, Scapy, SAI PTF framework; built into the container under /test or mounted from the host dev env under /test-dev) and the ixia-c traffic generator (make deploy-ixia-c) - drive the DASH dataplane over runtime socket communications (RPC commands or test traffic, socket or in-process); a legend distinguishes make targets or scripts in dash-pipeline from those in other repos (e.g. SAI/meta) and resources from external repos (Ubuntu, p4.org, etc.).]
\ No newline at end of file
diff --git a/dash-pipeline/tests/libsai/Makefile b/dash-pipeline/tests/libsai/Makefile
index 38fe7ac12..b605d2273 100644
--- a/dash-pipeline/tests/libsai/Makefile
+++ b/dash-pipeline/tests/libsai/Makefile
@@ -1,4 +1,7 @@
 # Call make in each subdirectory
+.ONESHELL:
+SHELL = /bin/bash
+.SHELLFLAGS += -e
 
 SUBDIRS := $(wildcard */.)
 
diff --git a/dash-pipeline/tests/libsai/vnet_out/vnet_out.cpp b/dash-pipeline/tests/libsai/vnet_out/vnet_out.cpp
index cbb2b9775..f450ab199 100644
--- a/dash-pipeline/tests/libsai/vnet_out/vnet_out.cpp
+++ b/dash-pipeline/tests/libsai/vnet_out/vnet_out.cpp
@@ -49,8 +49,6 @@ extern sai_status_t sai_create_dash_acl_group(
 extern sai_status_t sai_remove_dash_acl_group(
     _In_ sai_object_id_t eni_id);
 
-extern sai_dash_api_t sai_dash_api_impl;
-
 int main(int argc, char **argv)
 {
     sai_object_id_t switch_id = SAI_NULL_OBJECT_ID;
diff --git a/documentation/general/design/dash-sonic-hld.md b/documentation/general/design/dash-sonic-hld.md
index ff5d6c967..35337a4d5 100644
--- a/documentation/general/design/dash-sonic-hld.md
+++ b/documentation/general/design/dash-sonic-hld.md
@@ -36,6 +36,8 @@
 | 0.5 | 06/13/2022 | Chris Sommers | Schema Relationships |
 | 0.6 | 08/05/2022 | Mukesh M Velayudhan | Outbound VNI derivation in pipeline |
 | 0.7 | 08/09/2022 | Prince Sunny | Add Inbound Routing rules |
+| 0.8 | 04/20/2022 | Marian Pritsak | APP_DB to SAI mapping |
+
 # About this Manual
 This document provides more detailed design of DASH APIs, DASH orchestration agent, Config and APP DB Schemas and other SONiC buildimage changes required to bring up SONiC image on an appliance card. General DASH HLD can be found at [dash_hld](./dash-high-level-design.md).
@@ -372,6 +374,85 @@ metering_bucket = bucket_id ; metering and counter
 use_dst_vni = bool ; if true, use the destination VNET VNI for encap.
If false or not specified, use source VNET's VNI ``` +### 3.2.9 DASH orchagent (Overlay) + +| APP_DB Table | Key | Field | SAI Attributes/*objects* | Comment | +|-----------------------|---------------|------------------|---------------------------------------------------|-------------------------------------------------| +| DASH_APPLIANCE | | | | | +| | appliance_id | | | | +| | | sip | sai_vip_entry_t.vip | | +| | | vm_vni | sai_direction_lookup_entry_t.VNI | | +| DASH_VNET | | | *SAI_OBJECT_TYPE_VNET* | | +| | vnet_name | | | | +| | | vxlan_tunnel | | VxLAN tunnel won't be used | +| | | vni | SAI_VNET_ATTR_VNI | | +| | | guid | | Not relevant | +| | | address_spaces | | | +| | | peer_list | | | +| DASH_QOS | | | | | +| | qos_name | | | | +| | | qos_id | | | +| | | bw | SAI_ENI_ATTR_PPS | | +| | | cps | SAI_ENI_ATTR_CPS | | +| | | flows | SAI_ENI_ATTR_FLOWS | | +| DASH_ENI | | | *SAI_OBJECT_TYPE_ENI* | | +| | eni | | | | +| | | eni_id* | SAI_ENI_ETHER_ADDRESS_MAP_ENTRY_ATTR_ENI_ID | | +| | | mac_address* | sai_eni_ether_address_map_entry_t.address | | +| | | eni_id** | sai_outbound_eni_to_vni_entry_t.ENI | | +| | | qos | | | +| | | vnet** | SAI_ENI_ATTR_VNET_ID | VNET object ID | +| DASH_ACL_V4_IN | | | | Same for V6 | +| | eni | | | | +| | | stage | SAI_ENI_ATTR_INBOUND_V4_stage_DASH_ACL_GROUP_ID | STAGE1..STAGE5 | +| | | acl_group_id | SAI_ENI_ATTR_INBOUND_V4_stage_DASH_ACL_GROUP_ID | | +| DASH_ACL_GROUP | | | *SAI_OBJECT_TYPE_DASH_ACL_GROUP* | | +| | group_id | | | | +| | | ip_version | SAI_DASH_ACL_GROUP_ATTR_IP_ADDR_FAMILY | | +| DASH_ACL_RULE | | | *SAI_OBJECT_TYPE_DASH_ACL_RULE* | | +| | group_id | | SAI_DASH_ACL_RULE_ATTR_GROUP_ID | | +| | rule_num | | | | +| | | priority | SAI_DASH_ACL_RULE_ATTR_PRIORITY | | +| | | action | SAI_DASH_ACL_RULE_ATTR_ACTION | | +| | | terminating | SAI_DASH_ACL_RULE_ATTR_ACTION | AND_CONTINUE if not terminating | +| | | protocol | SAI_DASH_ACL_RULE_ATTR_PROTOCOL | | +| | | src_addr | SAI_DASH_ACL_RULE_ATTR_SIP | | +| | | dst_addr | SAI_DASH_ACL_RULE_ATTR_DIP | | +| | | dst_port | SAI_DASH_ACL_RULE_ATTR_DST_PORT | | +| | | src_port | SAI_DASH_ACL_RULE_ATTR_SRC_PORT | | +| DASH_ROUTE_TABLE | | | | | +| | eni | | sai_outbound_routing_entry_t.ENI | | +| | prefix | | sai_outbound_routing_entry_t.destination | | +| | | action_type | | Need action type for future cases | +| | | vnet | SAI_OUTBOUND_ROUTING_ENTRY_ATTR_DEST_VNET_VNI | VNI value taken from DASH_VNET table | +| | | appliance | | Not supported yet | +| | | overlay_ip | SAI_OUTBOUND_ROUTING_ENTRY_ATTR_OVERLAY_IP | | +| | | underlay_ip | | Not supported yet | +| | | overlay_sip | | Not supported yet | +| | | underlay_dip | | Not supported yet | +| | | customer_addr | | Not supported yet | +| | | metering_bucket | SAI_OUTBOUND_ROUTING_ENTRY_ATTR_COUNTER_ID | | +| DASH_MAPPING_TABLE | | | | | +| | vnet | | sai_outbound_ca_to_pa_entry_t.dest_vni | VNET's VNI | +| | ip_address | | sai_outbound_ca_to_pa_entry_t.dip | | +| | | routing_type | | | +| | | underlay_ip | SAI_OUTBOUND_CA_TO_PA_ENTRY_ATTR_UNDERLAY_DIP | | +| | | mac_address | SAI_OUTBOUND_CA_TO_PA_ENTRY_ATTR_OVERLAY_DMAC | | +| | | metering_bucket | SAI_OUTBOUND_CA_TO_PA_ENTRY_ATTR_COUNTER_ID | | +| | vnet* | | sai_pa_validation_entry_t.vnet_id | VNET's VNI | +| | | underlay_ip* | sai_pa_validation_entry_t.sip | SAI_PA_VALIDATION_ENTRY_ATTR_ACTION is permit | +| DASH_ROUTE_RULE_TABLE | | | | | +| | eni | | sai_inbound_routing_entry_t.eni_id | | +| | vni | | sai_inbound_routing_entry_t.vni | | +| | prefix | | sai_inbound_routing_entry_t.prefix 
| |
+| | | action_type | | |
+| | | priority | sai_inbound_routing_entry_t.priority | |
+| | | protocol | | |
+| | | vnet | SAI_INBOUND_ROUTING_ENTRY_ATTR_SRC_VNET_ID | |
+| | | pa_validation | SAI_INBOUND_ROUTING_ENTRY_ATTR_ACTION | use PA_VALIDATE if true |
+| | | metering_bucket | | |
+
+
 ## 3.3 Module Interaction
 A high-level module interaction is captured in the following diagram.
diff --git a/documentation/high-avail/design/AMD-Pensando_HA_Proposal.md b/documentation/high-avail/design/AMD-Pensando_HA_Proposal.md
new file mode 100644
index 000000000..048b4be73
--- /dev/null
+++ b/documentation/high-avail/design/AMD-Pensando_HA_Proposal.md
@@ -0,0 +1,723 @@
+# DASH High Availability proposal
+
+## Overview
+
+This document describes the high availability mechanisms for DPUs in the DASH framework. High availability is achieved by pairing DPUs such that a failure of either one of the pair results in the surviving DPU taking over forwarding on behalf of the failed DPU. Since forwarding on the DPU is stateful, all forwarding state from each DPU is synchronized with the other so that the switchover is seamless for user traffic. This document describes the procedures for such pairing and the mechanisms for synchronizing state between the pair.
+
+## Design Goals
+
+This proposal has the following design goals:
+
+1. All connections set up before switchover should work reliably after planned and unplanned switchovers
+1. Zero-downtime planned switchover, <2 sec downtime unplanned switchover
+1. Data packets should not be dropped due to flow replication delays
+1. Sync connection setup and teardown at datapath rate to support high CPS
+1. Sync only required packets to conserve PPS for data traffic
+
+## Functional Description
+
+### Terminology
+
+![](images/terminology.001.png)
+
+The picture above shows the terminology used for the layers in this document. The SONiC stack includes gnmi, swss, syncd, etc. The DASH SDK is the vendor's implementation-specific SDK. The DPU is the vendor's implementation of the hardware datapath and any associated software components.
+
+### Network Topology
+
+The pairing relation between the 2 DPUs is administratively defined via configuration. Pairing and synchronization happen via the data network. Each DPU is typically connected to the data fabric via 2 interfaces, which provides high availability in case of failure in one of the network paths (link or TOR). Each DPU has the following IPs:
+
+1. Link IPs connecting to the TOR
+1. A control plane loopback IP unique to each DPU
+1. 2 datapath VIPs shared between the pair of DPUs
+
+The loopback IPs (#2 and #3 above) are advertised and maintained via dynamic advertisements from the DPU. The picture below shows the network topology with 2 paired DPUs.
+
+![](images/topology.002.jpeg)
+
+#### Control Network Loopback IP (CNIP)
+
+The control plane loopback IP, also referred to in this document as the CNIP, is unique to each DPU and provides the ability to address each DPU as long as some connectivity exists between the DPU and the network fabric. This is used to communicate between the paired DPUs. All control plane traffic between the DPUs is sourced from this IP and addressed to this IP. The peer DPU is identified by its control plane loopback IP. The CNIP is used for pairing between the DPUs by exchanging control messages, for all flow synchronization packets - both bulk and datapath sync (covered later) - and for datapath heartbeat messages. The same IP address is used by the datapath to originate and terminate flow sync packets.
+
+#### Datapath VIP
+
+The datapath VIPs are the addresses used to direct traffic towards the DPU by the rest of the network. The use of 2 VIP loopback IPs allows the DPUs to provide an ENI-based active-active forwarding paradigm.
+
+### ENI based active-active
+
+The 2 datapath VIPs (DP-VIPs) are shared between the paired DPUs. Each DP-VIP is associated with a primary and a secondary DPU. In steady state each DPU has one primary DP-VIP and one secondary DP-VIP, while on the peer DPU the roles are flipped. The DPUs attract traffic such that traffic for each DP-VIP lands on the primary DPU for that VIP. The DPU achieves this by signaling via network protocols to the fabric.
+
+Each ENI configured on the DPU is associated with a DP-VIP by the controller. In steady state, for the set of ENIs managed by a DPU pair, traffic to and from one set of ENIs lands on one of the DPUs while traffic to/from the remaining ENIs is handled by the other DPU. Thus the dataplanes on both DPUs are actively forwarding traffic. On failure or administrative switchover, the secondary DPU takes over the active DPU's role for the DP-VIP, and hence that DPU ends up handling traffic for both DP-VIPs at the same time. State synchronization between the 2 DPUs ensures that on switchover there is no loss of state and the switchover is seamless for user traffic.
+
+### Datapath Heartbeat
+
+Each DPU sends heartbeat messages at a configured interval to its peer. When a DPU misses a set number of heartbeats it declares the peer unreachable and a switchover is initiated. The interval between heartbeats and the number of missed heartbeats are configurable. The heartbeat timing can be aggressive, hence the heartbeat itself is left to the DPU to perform.
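+
+As a minimal illustration (a sketch, not part of the proposal; all names here are hypothetical), the liveness decision reduces to comparing the time since the last received heartbeat against the product of the interval and the miss count - the two values exposed later as the HB Interval and HB Miss Count SAI attributes:
+
+```c
+#include <stdbool.h>
+#include <stdint.h>
+
+typedef struct {
+    uint16_t hb_interval_ms;  /* interval between heartbeats */
+    uint16_t hb_miss_count;   /* misses before peer is declared unreachable */
+    uint64_t last_rx_ms;      /* timestamp of last heartbeat received from peer */
+} hb_state_t;
+
+/* Called periodically; returns true when the peer should be declared
+ * unreachable and a switchover initiated. */
+static bool hb_peer_down(const hb_state_t *hb, uint64_t now_ms)
+{
+    return (now_ms - hb->last_rx_ms) >
+           (uint64_t)hb->hb_interval_ms * hb->hb_miss_count;
+}
+```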
+
+## State Synchronization
+
+State synchronization between the 2 DPUs uses the CNIP. All state synchronization happens at the granularity of the DP-VIP and flows from the primary of the DP-VIP towards the secondary. State synchronization happens in 2 stages:
+
+1. Bulk sync
+1. Datapath sync
+
+The figure below shows the channels used for synchronization. Bulk sync and datapath synchronization use 2 different channels.
+
+![](images/channels.003.png)
+
+The Control Plane Channel is managed by the SONiC stack. The SONiC stack relays messages from the DPU to the remote DPU in addition to originating and consuming messages. The Control Plane Channel is used to relay messages between the primary and the secondary. It carries bulk sync messages, control messages between the SONiC stacks, and control messages between the DPUs. In this proposal, this channel is a gRPC bidirectional stream between the 2 SONiC instances.
+
+In addition to the Control Plane Channel, there is a DP Sync channel. This channel is between the datapaths of the DPUs and is used for datapath synchronization, which is covered in a later section.
+
+### Bulk Sync
+
+On bootup, the DPU attempts to look for the configured peer. If the DPU is not able to reach its peer within a given interval of time, the DPU is said to be in "standalone" mode. Both DP-VIPs are set to PRIMARY-STANDALONE mode. This causes the DP-VIP IP addresses to be advertised to the network. Traffic received on the DP-VIPs is forwarded and flow state is built accordingly.
+
+As the peer DPU comes online at some later point in time and control plane connectivity is established between the two DPUs, the peer requests a sync of all accumulated state. This sync is called the bulk sync.
+
+The picture above shows a block schematic. The SONiC stack establishes the Control Plane Channel on discovering the liveness of the peer. The Control Plane Channel is defined as a bidirectional streaming gRPC channel. The bidirectional stream allows the peers to sync in lock step in an efficient way. The messages used on the channel are defined in a later section, and their usage in different workflows is described later as well.
+
+Bulk sync follows the perfect-sync method: all flows created during the bulk sync process are marked with a new color, and all flows not of that color are synced to the secondary.
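+
+A minimal sketch of the perfect-sync walk follows (illustrative only; the flow table layout and the sync routine are hypothetical, and real implementations do this in hardware or the SDK):
+
+```c
+#include <stdint.h>
+#include <stddef.h>
+
+typedef struct flow flow_t;
+struct flow {
+    uint8_t color;   /* color assigned when the flow was created */
+    flow_t *next;
+};
+
+extern void sync_flow_to_secondary(const flow_t *f);  /* hypothetical transport hook */
+
+static uint8_t current_color;
+
+/* Flows created after this point get the new color and are replicated
+ * inline via the DP Sync channel; the walk sends only pre-existing
+ * (old-color) flows over the Control Plane Channel. */
+void bulk_sync(flow_t *table_head)
+{
+    current_color ^= 1;  /* flip the color at the start of bulk sync */
+    for (flow_t *f = table_head; f != NULL; f = f->next) {
+        if (f->color != current_color)
+            sync_flow_to_secondary(f);
+    }
+}
+```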
+
+### Datapath Synchronization
+
+Inline flow replication is employed to allow for reliable and timely synchronization. This synchronization models the primary and the secondary DPUs as one logical DSC. Flow setup and delete are complete only once the operation is performed on both the primary and the secondary. The primary's policy evaluation result is honored on the secondary, so no policy evaluation happens on the secondary. There is a reconciliation that is performed on switchover, when the secondary becomes primary; that is discussed in a later section.
+
+![](images/dpreplication.004.png)
+
+Only control packets for a flow are synched to the secondary. Once the flow has been synchronized to the secondary, the primary does not forward any further packets to the secondary. The primary tracks the status of the synchronization to achieve this short circuit. The exact mechanisms for doing this will vary for each hardware implementation and are not covered here. The inline synchronization mechanism has the advantage that there is no buffering of data packets in the DPU. Any network losses, whether in the source-DPU, DPU-DPU, or DPU-destination paths, are handled by retransmission at the source (in the case of TCP) or as regular network drops (in the case of UDP/ICMP).
+
+The sync packets between the DPUs are logically structured as below.
+
+![](images/dp_repl_packet.005.png)
+
+### Interaction between Bulk Sync and Datapath Sync
+
+Due to the scale requirements for DASH, the flow table that needs to be handled during bulk sync can be very large, and hence the bulk sync process can take a long time to complete. It is not possible to halt all traffic that would create new flows during this time, so the sync mechanism has to handle creation of new flows during bulk sync. It is also possible that there are changes in policy that affect existing flows. The perfect-sync mechanism calls for marking flows created after the start of bulk sync with a different "color". The flow table is walked and all flows not of the current color are synchronized to the peer. Any flows that are created during the bulk sync phase are synchronized inline via the datapath synchronization path. Other challenges include:
+
+- Handling of flows that are affected by policy changes
+- Flows that terminate during bulk sync
+- Flow aging during bulk sync
+
+All these problems require that the hardware implementation maintain a mechanism of marking the flows with a synchronization status and handle these events per that status. During bulk sync there might be flows in the bulk sync snapshot that are affected by changes. The actual mechanism of handling these would differ by implementation and is not covered here. When implementation-specific signaling is needed between the DPUs for such optimizations, the control plane channel allows such messages to be relayed.
+
+## HA State Machine
+
+![](images/ha_sm.006.png)
+
+**Bootup**
+
+This is the initial state for the box on bootup. The state machine waits for external input indicating that configuration is complete, at which point it progresses to the next state.
+
+**Init**
+
+In this state the get\_capabilities call retrieves the local datapath capabilities. The SAI SDK call returns the local DP's capabilities as defined in the capabilities struct described previously. These will be used in later exchanges with the peer. Validation of the local configuration, as well as any other local checks, is performed in this state. If there are configuration errors, the state machine moves to the Cfg\_err state and waits for configuration changes to fix the errors.
+
+**Peer Connect**
+
+A timed attempt is made to connect to the peer in this stage. If no connectivity is established with the peer within a configured timeout interval, the operational state moves to standalone-primary.
+
+**Standalone Primary**
+
+In the standalone primary mode all datapath functions are enabled and the DPU starts forwarding traffic. Since there is no peer connectivity established, no flow synchronization is needed.
+
+**Compat Check**
+
+The peer connect sequence exchanges capabilities with the peer. These received capabilities are then compared with the local capabilities for compatibility. The compatibility check is done in 2 stages. The first is compatibility at the SONiC stack level; any common attributes (across all implementations) are checked here. The next stage is to check compatibility between the underlying DASH implementations. This is done via a SAI-DASH call to the SDK.
+
+**Start Sync**
+
+Once the two peers are deemed compatible, bulk sync is initiated between the peers. First the SONiC stack establishes a bidirectional streaming sync channel between the peers (the Control Plane Channel). This channel is then passed to the DASH SDK. The underlying DPU implementation then uses the CP Channel to exchange messages between the peers. The data transfer involves flow sync messages from the primary to the secondary. There can also be messages from the secondary to the primary to back-pressure or optimize the primary-to-secondary flow data.
+
+**Wait Sync**
+
+In this state the state machine on the SONiC stack waits for the DP to signal completion of bulk sync. The completion of the bulk sync is notified by the DP in a CP control message. At this point further states are determined by the configured admin role of the DPU.
+
+**Wait Peer Sync**
+
+This state applies to a node that is in standalone primary and is servicing a bulk sync request from the newly booting peer. Since the bulk sync is initiated from the new node, the existing node waits for a signal from the SDK that bulk sync has completed. When the BulkSyncDone message is received it is relayed to the peer, and it also triggers the transition out of this state.
+
+**Wait HA Role Activation**
+
+As per deployment requirements, the controller has the control to activate the HA role. The transition to primary or secondary happens only after the controller signals so. This state waits on such an external trigger to activate.
+
+**Activate Primary**
+
+This state is reached when bulk sync is complete and the admin role of the node is primary. The node then attempts to take over as the primary. This is triggered by notifying the underlying datapath to switch to the primary role. At this point the VIP routes are advertised to attract traffic.
+
+**Wait Primary**
+
+In this state the node is waiting for the datapath to signal completion of taking over as primary. The datapath indicates this by notifying the SONiC stack via an oper status update message. At this point the peer is notified to move to standby.
+
+**Activate Secondary**
+
+This state is reached when bulk sync is complete and the admin role of the node is secondary. The node then attempts to move to the secondary state. This is triggered by notifying the underlying datapath to switch to the secondary role. At this point the VIP routes are advertised; the routes may be advertised with a less desirable metric.
+
+**Wait Secondary**
+
+In this state the node is waiting for the datapath to signal completion of the transition to secondary. The datapath indicates this by notifying the SONiC stack via an oper status update message.
+
+**Secondary**
+
+If the configured role is secondary, the node goes to the terminal state Secondary. The DPU then waits for a switchover event to switch to primary. In this state the node keeps receiving flow sync messages from the peer and keeps the datapath ready for switchover.
+
+**Primary**
+
+This is the terminal state for the node when the node is configured as the primary. In this state the local datapath is actively forwarding traffic and synching state to the peer.
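+
+The states above can be summarized as a simple enumeration (illustrative only; the proposal does not prescribe a concrete representation):
+
+```c
+typedef enum ha_sm_state {
+    HA_STATE_BOOTUP,
+    HA_STATE_INIT,
+    HA_STATE_CFG_ERR,
+    HA_STATE_PEER_CONNECT,
+    HA_STATE_STANDALONE_PRIMARY,
+    HA_STATE_COMPAT_CHECK,
+    HA_STATE_START_SYNC,
+    HA_STATE_WAIT_SYNC,
+    HA_STATE_WAIT_PEER_SYNC,
+    HA_STATE_WAIT_HA_ROLE_ACTIVATION,
+    HA_STATE_ACTIVATE_PRIMARY,
+    HA_STATE_WAIT_PRIMARY,
+    HA_STATE_PRIMARY,            /* terminal */
+    HA_STATE_ACTIVATE_SECONDARY,
+    HA_STATE_WAIT_SECONDARY,
+    HA_STATE_SECONDARY,          /* terminal */
+} ha_sm_state_t;
+```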
+
+### SAI Definitions
+
+The SAI API calls necessary for communication between the SONiC stack and the SAI-DASH SDK are defined below.
+
+```cpp
+#include <saitypes.h>
+
+/**
+ * @brief Notification data format for the received DPU control message
+ * callback.
+ */
+typedef struct _sai_dash_dpu_control_message_notification_data_t {
+    /**
+     * @brief Control message Type
+     */
+    sai_uint16_t type;
+
+    /**
+     * @brief Data for the message
+     */
+    sai_u8_list_t data;
+} sai_dash_dpu_control_message_notification_data_t;
+
+/**
+ * @brief L4 information for TCP and UDP flows.
+ */
+typedef struct _sai_dash_flow_tcp_udp_info_t {
+    /** Source port */
+    sai_uint16_t src_port;
+
+    /** Destination port */
+    sai_uint16_t dst_port;
+} sai_dash_flow_tcp_udp_info_t;
+
+/**
+ * @brief L4 flow information for ICMP flows.
+ */
+typedef struct _sai_dash_flow_icmp_info_t {
+    /** ICMP Type */
+    sai_uint32_t type;
+
+    /** ICMP code */
+    sai_uint32_t code;
+
+    /** ICMP ID */
+    sai_uint32_t id;
+} sai_dash_flow_icmp_info_t;
+
+/**
+ * @brief L4 Flow information
+ */
+typedef union _sai_dash_flow_l4_info_t {
+    /** TCP/UDP info */
+    sai_dash_flow_tcp_udp_info_t tcp_udp;
+
+    /** ICMP Info */
+    sai_dash_flow_icmp_info_t icmp;
+} sai_dash_flow_l4_info_t;
+
+/**
+ * @brief Notification data format for received flow sync messages from the DPU
+ */
+typedef struct _sai_dash_flow_sync_message_notification_data_t {
+    /**
+     * @brief ENI MAC for this flow
+     */
+    sai_mac_t eni_mac;
+
+    /**
+     * @brief Source IP address
+     */
+    sai_ip_address_t src_ip;
+
+    /**
+     * @brief Destination IP address
+     */
+    sai_ip_address_t dst_ip;
+
+    /**
+     * @brief IP Protocol
+     */
+    sai_uint8_t protocol;
+
+    /**
+     * @brief L4 Information (TCP/UDP/ICMP)
+     */
+    sai_dash_flow_l4_info_t l4_info;
+
+    /**
+     * @brief Policy results metadata
+     */
+    sai_u8_list_t metadata;
+} sai_dash_flow_sync_message_notification_data_t;
+
+/**
+ * @brief Attribute IDs for get_peer_capabilities
+ */
+typedef enum _sai_get_peer_capabilities_attr_t {
+    /**
+     * @brief HB Interval
+     * @type sai_uint16_t
+     */
+    SAI_DASH_GET_PEER_CAPABILITIES_ATTR_HB_INTERVAL,
+
+    /**
+     * @brief HB Miss Count
+     * @type sai_uint16_t
+     */
+    SAI_DASH_GET_PEER_CAPABILITIES_ATTR_HB_MISS_COUNT,
+
+    /**
+     * @brief Capabilities
+     * @type sai_uint8_list_t
+     */
+    SAI_DASH_GET_PEER_CAPABILITIES_ATTR_CAPABILITIES,
+
+} sai_get_peer_capabilities_attr_t;
+
+/**
+ * @brief Get capabilities of the DP
+ *
+ * @param[in] vipID VIP object ID
+ * @param[in] attr_count Number of attributes
+ * @param[inout] attr_list Array of attributes
+ *
+ * @return #SAI_STATUS_SUCCESS on success, failure status code on error
+ */
+typedef sai_status_t (*sai_get_capabilities_fn) (
+    _In_ sai_object_id_t vipID,
+    _In_ uint32_t attr_count,
+    _Inout_ sai_attribute_t *attr_list);
+
+/**
+ * @brief Attribute IDs for register_cp_channel
+ */
+typedef enum _sai_register_cp_channel_attr_t {
+    /**
+     * @brief Named pipe for bi-directional control stream
+     * @type sai_uint8_list_t
+     */
+    SAI_DASH_REGISTER_CP_CHANNEL_ATTR_NAMED_PIPE,
+} sai_register_cp_channel_attr_t;
+
+/**
+ * @brief Register the CP control channel with the DP
+ *
+ * @param[in] vipID VIP object ID
+ * @param[in] attr_count Number of attributes
+ * @param[in] attr_list Array of attributes
+ *
+ * @return #SAI_STATUS_SUCCESS on success, failure status code on error
+ */
+typedef sai_status_t (*sai_register_cp_channel_fn) (
+    _In_ sai_object_id_t vipID,
+    _In_ uint32_t attr_count,
+    _In_ sai_attribute_t *attr_list);
+
+/**
+ * @brief Attribute IDs for process_peer_capabilities
+ */
+typedef enum _sai_process_peer_capabilities_attr_t {
+    /**
+     * @brief HB Interval
+     * @type sai_uint16_t
+     */
+    SAI_DASH_PROCESS_PEER_CAPABILITIES_ATTR_HB_INTERVAL,
+
+    /**
+     * @brief HB Miss Count
+     * @type sai_uint16_t
+     */
+    SAI_DASH_PROCESS_PEER_CAPABILITIES_ATTR_HB_MISS_COUNT,
+
+    /**
+     * @brief Capabilities
+     * @type sai_uint8_list_t
+     */
+    SAI_DASH_PROCESS_PEER_CAPABILITIES_ATTR_CAPABILITIES,
+
+} sai_process_peer_capabilities_attr_t;
+
+/**
+ * @brief Process peer capabilities of the peer DPU
+ *
+ * @param[in] vipID VIP object ID
+ * @param[in] attr_count Number of attributes
+ * @param[inout] attr_list Array of attributes
+ *
+ * @return #SAI_STATUS_SUCCESS on success, failure status code on error
+ */
+typedef sai_status_t (*sai_process_peer_capabilities_fn) (
+    _In_ sai_object_id_t vipID,
+    _In_ uint32_t attr_count,
_Inout_ sai_attribute_t *attr_list);
+
+/**
+ * @brief Attribute IDs for process_dpu_control_message
+ */
+typedef enum _sai_process_dpu_control_message_attr_t {
+    /**
+     * @brief Type
+     * @type sai_uint16_t
+     */
+    SAI_DASH_PROCESS_DPU_CONTROL_MESSAGE_ATTR_TYPE,
+
+    /**
+     * @brief Data
+     * @type sai_u8_list_t
+     */
+    SAI_DASH_PROCESS_DPU_CONTROL_MESSAGE_ATTR_DATA,
+} sai_process_dpu_control_message_attr_t;
+
+/**
+ * @brief Control messages exchanged between the datapaths of the DPUs
+ *
+ * @param[in] vipID VIP object ID
+ * @param[in] attr_count Number of attributes
+ * @param[in] attr_list Array of attributes
+ *
+ * @return #SAI_STATUS_SUCCESS on success, failure status code on error
+ */
+typedef sai_status_t (*sai_process_dpu_control_message_fn)(
+        _In_ sai_object_id_t vipID,
+        _In_ uint32_t attr_count,
+        _In_ sai_attribute_t *attr_list);
+
+/**
+ * @brief Attribute IDs for process_flow_sync_message
+ */
+typedef enum _sai_process_flow_sync_message_attr_t {
+    /**
+     * @brief Flow information
+     * @type sai_dash_flow_sync_message_notification_data_t
+     */
+    SAI_DASH_PROCESS_FLOW_SYNC_MESSAGE_ATTR_FLOW_INFO,
+} sai_process_flow_sync_message_attr_t;
+
+/**
+ * @brief Flow sync messages exchanged between DPUs
+ *
+ * @param[in] vipID VIP object ID
+ * @param[in] attr_count Number of attributes
+ * @param[in] attr_list Array of attributes
+ *
+ * @return #SAI_STATUS_SUCCESS on success, failure status code on error
+ */
+typedef sai_status_t (*sai_process_flow_sync_message_fn)(
+        _In_ sai_object_id_t vipID,
+        _In_ uint32_t attr_count,
+        _In_ sai_attribute_t *attr_list);
+
+/**
+ * @brief Attribute IDs for oper_role_status
+ */
+typedef enum _sai_oper_role_status_attr_t {
+    /**
+     * @brief Oper state
+     * @type sai_uint16_t
+     */
+    SAI_DASH_OPER_ROLE_STATUS_ATTR_OPER_STATE,
+} sai_oper_role_status_attr_t;
+
+/**
+ * @brief Update oper role state
+ *
+ * Updates from the DPU for the operational role. The current state is passed as an attribute.
+ *
+ * @param[in] vipID VIP object ID
+ * @param[in] attr_count Number of attributes
+ * @param[in] attr_list Array of attributes
+ *
+ * @return #SAI_STATUS_SUCCESS on success, failure status code on error
+ */
+typedef sai_status_t (*sai_oper_role_status_fn)(
+        _In_ sai_object_id_t vipID,
+        _In_ uint32_t attr_count,
+        _In_ sai_attribute_t *attr_list);
+
+/**
+ * @brief Attribute IDs for cp_control_message
+ */
+typedef enum _sai_cp_control_message_attr_t {
+    /**
+     * @brief Operation
+     * @type sai_uint16_t
+     */
+    SAI_DASH_CP_CONTROL_MESSAGE_ATTR_OPERATION,
+} sai_cp_control_message_attr_t;
+
+/**
+ * @brief Process CP control message
+ *
+ * Process control messages between DPUs. The operation is passed as an attribute.
+ *
+ * @param[in] vipID VIP object ID
+ * @param[in] attr_count Number of attributes
+ * @param[in] attr_list Array of attributes
+ *
+ * @return #SAI_STATUS_SUCCESS on success, failure status code on error
+ */
+typedef sai_status_t (*sai_cp_control_message_fn)(
+        _In_ sai_object_id_t vipID,
+        _In_ uint32_t attr_count,
+        _In_ sai_attribute_t *attr_list);
+
+typedef struct _sai_dash_ha_api_t {
+    sai_register_cp_channel_fn         register_cp_channel;
+    sai_get_capabilities_fn            get_capabilities;
+    sai_process_peer_capabilities_fn   process_peer_capabilities;
+    sai_process_dpu_control_message_fn process_dpu_control_message;
+    sai_process_flow_sync_message_fn   process_flow_sync_message;
+    sai_oper_role_status_fn            oper_role_status;
+    sai_cp_control_message_fn          cp_control_message;
+} sai_dash_ha_api_t;
+```
+
+### Control Plane Channel Message Definitions
+
+The definitions below pertain to the gRPC channel defined as the Control Plane Channel. This channel is established between the SONIC stacks on the two peer nodes. These messages can originate from SONIC or be sent by the DPU implementation and relayed via SONIC.
+
+```protobuf
+syntax = "proto3";
+package dashsync;
+
+// IP address families
+enum IPAF {
+    IP_AF_NONE = 0;
+    IP_AF_INET = 1;  // IPv4
+    IP_AF_INET6 = 2; // IPv6
+}
+
+// Admin roles for a VIP
+enum AdminRole {
+    // Unspecified
+    AdminNone = 0;
+    // Primary node for the peering session.
+    AdminPrimary = 1;
+    // Secondary node for the peering session.
+    AdminSecondary = 2;
+}
+
+// Operational state of the VIP
+enum OperRole {
+    // Unspecified
+    OperNone = 0;
+    // Primary node for the peering session.
+    OperPrimary = 1;
+    // Secondary node for the peering session.
+    OperSecondary = 2;
+    // Standalone mode. No active peering.
+    OperStandalone = 3;
+}
+
+// IP address object
+message IPAddress {
+    // IP address family
+    IPAF Af = 1;
+    oneof v4_or_v6 {
+        // IPv4 address
+        fixed32 V4Addr = 2;
+        // IPv6 address
+        bytes V6Addr = 3;
+    }
+}
+
+// L4 portion of the flow key tuple
+message FlowL4Info {
+    // Key fields for TCP/UDP flows
+    message TCPUDPInfo {
+        uint32 SrcPort = 1;
+        uint32 DstPort = 2;
+    }
+    // Key fields for ICMP flows
+    message ICMPInfo {
+        uint32 Type = 1;
+        uint32 Code = 2;
+        uint32 Id = 3;
+    }
+    oneof l4_info {
+        TCPUDPInfo TcpUdpInfo = 1;
+        ICMPInfo IcmpInfo = 2;
+    }
+}
+
+// Flow key for IP flows
+message IPFlowKey {
+    // ENI MAC address
+    uint64 EniMAC = 1;
+    // Source IP seen in the packet
+    IPAddress SrcIP = 2;
+    // Destination IP seen in the packet
+    IPAddress DstIP = 3;
+    // IP protocol
+    uint32 IPProtocol = 4;
+    // L4 information of the flow key
+    FlowL4Info L4Info = 5;
+}
+
+// Flow sync message
+message FlowSyncMsg {
+    message FlowInfo {
+        // DP-VIP associated with the flow
+        bytes VipId = 1;
+        // IP flow tuple
+        IPFlowKey Key = 2;
+        // Metadata containing policy results
+        bytes Metadata = 3;
+    }
+    repeated FlowInfo Info = 1;
+}
+
+// Control plane operations carried in CPControlMsg
+enum CPControlOperation {
+    OpNone = 0;
+    OpStartBulkSync = 1;
+    OpBulkSyncDone = 2;
+    OpShutdownPrepare = 3;
+    OpShutdownReady = 4;
+    OpShutdown = 5;
+    OpSwitchover = 6;
+    OpSwitchoverReady = 7;
+    OpSwitchoverDone = 8;
+}
+
+// Message used to trigger/notify state change events between peers.
+message CPControlMsg {
+    // The VIP this event pertains to.
+    bytes VipId = 1;
+    // Operation/event
+    CPControlOperation Operation = 2;
+}
+
+// CompatCheck carries the compatibility information for the node. It
+// contains information about the DP-VIPs, the DPU's capabilities and any
+// other relevant capabilities to be checked.
+message CompatCheck {
+    // VIP related parameters. One added per VIP.
+    message DpVIPInfo {
+        bytes VipId = 1;
+        // Address configured
+        IPAddress VipIP = 2;
+        // Role of the VIP on this node
+        AdminRole AdminRole = 3;
+        // Metric used for the protocol to differentiate the primary/secondary routes.
+        // Valid when using BGP as the underlay protocol.
+        uint32 ProtocolMetric = 4;
+    }
+    // DPU capabilities on this node.
+    message DPUInfo {
+        // Configured interval for HB messages
+        uint32 HBinterval = 1;
+        // Number of HB misses that will trigger switchover
+        uint32 MissCount = 2;
+        // Opaque capability data
+        bytes Capabilities = 3;
+    }
+    // Compatibility/capability information for each VIP on the node. Typically two entries.
+    repeated DpVIPInfo VipInfo = 1;
+    // Capability information for the datapath.
+    DPUInfo DPUCapabilities = 2;
+}
+
+// Result codes from the compatibility check between the nodes.
+enum CompatResult {
+    CompatSuccess = 0;
+    CompatFailure = 1;
+}
+
+// Results from the compatibility check between the nodes.
+message CompatResults {
+    // Compatibility error information pertaining to a VIP
+    message DpVIPCompatError {
+        // VIP this pertains to
+        bytes VipID = 1;
+        // Error code
+        uint32 Code = 2;
+        // Detailed user-readable reason
+        string Reason = 3;
+    }
+
+    // Compatibility error information pertaining to the DPU
+    message DPUCompatError {
+        // Error code
+        uint32 Code = 1;
+        // Detailed user-readable reason
+        string Reason = 2;
+    }
+    CompatResult Result = 1;
+    repeated DpVIPCompatError DpVIPInfo = 2;
+    DPUCompatError DPUInfo = 3;
+}
+
+// Control message. This can be either from the primary to the secondary or vice versa.
+message DPUControlMsg {
+    // Control message type
+    uint32 Type = 1;
+    // Data for the message
+    bytes Data = 2;
+}
+
+// Wrapper message for all control messages between peers
+message ControlMsg {
+    oneof ctrl_msg {
+        DPUControlMsg DPUControlMsg = 2;
+        CompatCheck CompatCheck = 3;
+        CompatResults CompatResults = 4;
+        CPControlMsg CPControlMsg = 5;
+    }
+}
+
+// Wrapper message for all messages between the peers.
+message SyncMsg {
+    oneof sync_msg {
+        FlowSyncMsg FlowSyncMsg = 1;
+        ControlMsg ControlMsg = 2;
+    }
+}
+
+service CPSync {
+    rpc SyncChannel(stream SyncMsg) returns (stream SyncMsg) {}
+}
+```
+
+## Message Flows
+
+The following are some important procedures and their corresponding control message flows.
+
+### Node Pairing and Bulk Sync
+
+![](images/node_pairing.007.png)
+
+Node pairing follows the state machine defined in the previous section. The SONIC stack establishes the initial connection to the peer. For each DPU pair the SONIC stack establishes a bi-directional streaming channel to the peer. All control messages, including bulk sync, use this channel. There is also a parallel channel registered between the DPU datapaths directly. This channel carries the DP flow sync messages between the DPUs (represented by the orange dotted lines in the picture above).
+
+The initial compatibility messages exchanged between the peers ensure that capabilities match. These could be hardware capabilities, software capabilities, scale limits, etc. As mentioned earlier, these capabilities are a combination of capabilities from the SONIC stack and capabilities of the DPU.
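+
+To make the channel usage concrete, the following is a minimal sketch of how the SONIC stack might open the CPSync stream and send its CompatCheck as the first control message, using the gRPC C++ API and the classes generated from the proto definitions above. The generated header name, VIP ID, addresses, and capability values are illustrative assumptions, not part of this specification.
+
+```cpp
+#include <memory>
+#include <grpcpp/grpcpp.h>
+#include "dashsync.grpc.pb.h"  // assumed name of the generated header
+
+// Open the bi-directional CPSync stream and send our CompatCheck.
+// The returned stream is then used for all further control messages.
+std::unique_ptr<grpc::ClientReaderWriter<dashsync::SyncMsg, dashsync::SyncMsg>>
+open_pairing_stream(const std::shared_ptr<grpc::Channel>& channel,
+                    grpc::ClientContext& ctx) {
+    auto stub = dashsync::CPSync::NewStub(channel);
+    auto stream = stub->SyncChannel(&ctx);
+
+    dashsync::SyncMsg msg;
+    auto* compat = msg.mutable_controlmsg()->mutable_compatcheck();
+
+    // One DpVIPInfo entry per DP-VIP hosted on this node (typically two).
+    auto* vip = compat->add_vipinfo();
+    vip->set_vipid("vip-1");                       // illustrative VIP ID
+    vip->mutable_vipip()->set_af(dashsync::IP_AF_INET);
+    vip->mutable_vipip()->set_v4addr(0x0a000001);  // 10.0.0.1, example only
+    vip->set_adminrole(dashsync::AdminPrimary);
+    vip->set_protocolmetric(100);                  // example BGP metric
+
+    // DPU heartbeat parameters and opaque capability data.
+    auto* dpu = compat->mutable_dpucapabilities();
+    dpu->set_hbinterval(100);                      // illustrative HB interval
+    dpu->set_misscount(3);
+    dpu->set_capabilities("opaque-dpu-capabilities");
+
+    stream->Write(msg);  // the peer answers with CompatResults on the stream
+    return stream;
+}
+```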
+
+The SONIC stack initiates bulk sync between the peers and also notifies the DPU via the SDK to start bulk sync. The DPU sends and receives flow synchronization messages via the CP stream. A DPU can back-pressure or otherwise optimize the bulk sync process with its peer DPU by exchanging DPUControlMsgs on the channel.
+
+As is evident from the picture above, datapath flow synchronization may be initiated by the DPUs in parallel with the bulk sync process.
+
+### Unplanned Switchover
+
+![](images/unplanned_switchover.008.png)
+
+This is triggered by failures in the network or a failure on the peer DPU, typically detected via loss of heartbeat messages between the DPUs. The DPU notifies the SONIC stack of the HB loss via a notification message and initiates the switchover to the standalone state on the DPU. Once the switchover is complete, the SONIC stack is notified of the change via a notification.
+
+All flows inserted on the secondary before switchover reflect the evaluation of the primary's policies. Flow resimulation causes those flows to be re-evaluated against local policies. The SONIC stack waits for confirmation from the SDN controller that all policy configurations on the DPU are updated and then initiates flow reconciliation on the DPU. This triggers flow resimulation to update the policy results on flows as per the current policy configuration.
+
+### Planned Switchover
+
+![](images/planned_switchover.009.jpeg)
+
+Switchover can be a planned event for maintenance and other reasons. With planned switchover the goal is to have close to zero loss, and to coordinate between the primary and secondary to achieve this. Both the DP-VIPs switch roles to primary on this trigger.
+
+The controller initiates the planned switchover and notifies the secondary DPU to initiate switchover. Once switchover is complete, the newly promoted primary DPU relays a SwitchoverDone message to the old primary DPU. The old primary initiates a withdrawal of protocol routes so the network can drain traffic. During this time the old primary continues to forward traffic so any traffic in transit is forwarded without being dropped. During this network convergence timeout both the primary and secondary are forwarding traffic, and flow sync messages may be exchanged in both directions.
+
+After the network convergence time the new primary enters the PRE\_STANDALONE state, waits for a flush timeout, and then transitions to the standalone state.
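+
+A minimal sketch of the SwitchoverDone exchange described above, reusing the stream type from the pairing sketch: the newly promoted primary reports completion, and the old primary reacts by starting route withdrawal. The helper withdraw_vip_routes() is a hypothetical placeholder for the route-withdrawal hook, not a defined API.
+
+```cpp
+#include <string>
+#include <grpcpp/grpcpp.h>
+#include "dashsync.grpc.pb.h"  // assumed name of the generated header
+
+// Placeholder for the routing-stack hook that withdraws the VIP routes.
+void withdraw_vip_routes(const std::string& vip_id);
+
+// Newly promoted primary: report switchover completion for a VIP.
+void send_switchover_done(
+        grpc::ClientReaderWriter<dashsync::SyncMsg, dashsync::SyncMsg>* stream,
+        const std::string& vip_id) {
+    dashsync::SyncMsg msg;
+    auto* cp = msg.mutable_controlmsg()->mutable_cpcontrolmsg();
+    cp->set_vipid(vip_id);
+    cp->set_operation(dashsync::OpSwitchoverDone);
+    stream->Write(msg);
+}
+
+// Old primary: on SwitchoverDone, start draining traffic by withdrawing
+// the VIP routes while continuing to forward during network convergence.
+void handle_control_msg(const dashsync::CPControlMsg& cp) {
+    if (cp.operation() == dashsync::OpSwitchoverDone) {
+        withdraw_vip_routes(cp.vipid());
+    }
+}
+```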
diff --git a/documentation/high-avail/design/images/channels.003.png b/documentation/high-avail/design/images/channels.003.png
new file mode 100644
index 000000000..fc29590b3
Binary files /dev/null and b/documentation/high-avail/design/images/channels.003.png differ
diff --git a/documentation/high-avail/design/images/dp_repl_packet.005.png b/documentation/high-avail/design/images/dp_repl_packet.005.png
new file mode 100644
index 000000000..39c73c5c4
Binary files /dev/null and b/documentation/high-avail/design/images/dp_repl_packet.005.png differ
diff --git a/documentation/high-avail/design/images/dpreplication.004.png b/documentation/high-avail/design/images/dpreplication.004.png
new file mode 100644
index 000000000..5a70e2d72
Binary files /dev/null and b/documentation/high-avail/design/images/dpreplication.004.png differ
diff --git a/documentation/high-avail/design/images/ha_sm.006.png b/documentation/high-avail/design/images/ha_sm.006.png
new file mode 100644
index 000000000..4f9bbbd42
Binary files /dev/null and b/documentation/high-avail/design/images/ha_sm.006.png differ
diff --git a/documentation/high-avail/design/images/node_pairing.007.png b/documentation/high-avail/design/images/node_pairing.007.png
new file mode 100644
index 000000000..565d649c8
Binary files /dev/null and b/documentation/high-avail/design/images/node_pairing.007.png differ
diff --git a/documentation/high-avail/design/images/planned_switchover.009.jpeg b/documentation/high-avail/design/images/planned_switchover.009.jpeg
new file mode 100644
index 000000000..a9b0f8d02
Binary files /dev/null and b/documentation/high-avail/design/images/planned_switchover.009.jpeg differ
diff --git a/documentation/high-avail/design/images/terminology.001.png b/documentation/high-avail/design/images/terminology.001.png
new file mode 100644
index 000000000..653b54470
Binary files /dev/null and b/documentation/high-avail/design/images/terminology.001.png differ
diff --git a/documentation/high-avail/design/images/topology.002.jpeg b/documentation/high-avail/design/images/topology.002.jpeg
new file mode 100644
index 000000000..ef78f063e
Binary files /dev/null and b/documentation/high-avail/design/images/topology.002.jpeg differ
diff --git a/documentation/high-avail/design/images/unplanned_switchover.008.png b/documentation/high-avail/design/images/unplanned_switchover.008.png
new file mode 100644
index 000000000..961687676
Binary files /dev/null and b/documentation/high-avail/design/images/unplanned_switchover.008.png differ