Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions ansible/roles/test/files/ptftests/py3/sflow_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,15 +190,15 @@ def packet_analyzer(self, port_sample, collector, poll_test):
% (data['total_counter_count'], collector))
else:
logging.info("..Analyzing polling test counter packets")
self.assertTrue(data['total_samples'] != 0,
"....Packets are not received in active collector ,%s" % collector)
self.assertTrue(data['total_counter_count'] != 0,
"....Counter packets are not received in active collector ,%s" % collector)
self.analyze_counter_sample(
data, collector, self.polling_int, port_sample)
else:
logging.info(
"Analyzing flow samples in collector %s" % collector)
self.assertTrue(data['total_samples'] != 0,
"....Packets are not received in active collector ,%s" % collector)
self.assertTrue(data['total_flow_count'] != 0,
"....Flow packets are not received in active collector ,%s" % collector)
self.analyze_flow_sample(data, collector)
return data

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4432,12 +4432,6 @@ scripts:
#######################################
##### sflow #####
#######################################
sflow/test_sflow.py:
skip:
reason: "The testcase is skipped due to github issue #21701"
conditions:
- "https://github.com/sonic-net/sonic-mgmt/issues/21701"

sflow/test_sflow.py::TestReboot::testFastreboot:
skip:
reason: "Dualtor topology doesn't support advanced-reboot"
Expand Down
67 changes: 66 additions & 1 deletion tests/sflow/test_sflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
--enable_sflow_feature: Enable sFlow feature on DUT. Default is disabled
"""

import ast
import pytest
import logging
import time
import json
import re

Expand Down Expand Up @@ -229,6 +231,58 @@ def config_sflow_interfaces(duthost, intf, **kwargs):
# ----------------------------------------------------------------------------------


def verify_hsflowd_ready(duthost, collector_ips):
"""
Verify hsflowd has fully initialized with all specified collector configurations.
This is done by checking if /etc/hsflowd.auto contains an entry for each collector IP.

Args:
duthost: DUT host object
collector_ips: List of collector IP addresses to check for

Returns:
True if hsflowd.auto contains entries for all collector IPs, False otherwise
"""
return all(
duthost.shell(
f"docker exec sflow grep -q 'collector={ip}' /etc/hsflowd.auto 2>/dev/null",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

have you verified that this works if we are hitting the case where SYSTEM_READY is not up yet? I'm pretty sure hsflowd.auto gets generated before the timeout for SYSTEM_READY runs, which is what we are trying to wait for here. There might be value in also checking for hsflowd to be up, but I don't think it solves that particular problem case.

Copy link
Copy Markdown
Contributor Author

@vkjammala-arista vkjammala-arista Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From sflow HLD: hsflowd daemon will initialize only after receiving the SYSTEM_READY|SYSTEM_STATE Status=Up from SONiC. The system ready state however depends on all the monitored daemons to be ready. Failure on any of these, will result in system state to be down. In such scenarios, sflow will wait until 180 seconds and if system ready is not up will proceed with initialization

IIUC, hsflowd processes the collector configuration only after SYSTEM_READY is up. That is why I added a wait of up to 240 seconds (see wait_until_hsflowd_ready) for hsflowd to process the collector config, i.e. for hsflowd.auto to contain the collector IP information.

And I'm not seeing any issues around this with this PR fix. Please let me know if my understanding isn't correct.

Copy link
Copy Markdown
Contributor

@anders-nexthop anders-nexthop Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See my change #21195 which has a very similar hsflow_ready check, as well as an explicit SYSTEM_READY check. I found that 180s works fine for the timeout, is there a reason you went with 240s?

Copy link
Copy Markdown
Contributor Author

@vkjammala-arista vkjammala-arista Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure how helpful or reliable the SYSTEM_READY check is, and I don't know exactly how the sFlow/hsflowd process uses this SYSTEM_READY state. So, in my opinion, as far as the sFlow test is concerned, checking whether hsflowd has processed the collector IP configuration should be enough.

I have gone through your hsflow_ready check, and I see that you aren't checking whether the collector config has actually been processed. Checking only whether the hsflowd process is up might not be sufficient in some scenarios.

is there a reason you went with 240s?
No specific reason; I just wanted to allow a bit more delay than the HLD states (i.e. >180 secs) to make the test more reliable.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SYSTEM_READY is a top-level concept in SONiC, and is definitely something we can and should use if we need to.

For this case, though, I agree with you.

We can inspect the hsflowd source directly and see what it does. Here's the SYSTEM_READY check (from https://github.com/sflow/host-sflow/blob/05be6bd61bd9926cd8a0e0d837a69165fd8add71/src/Linux/mod_sonic.c#L2355):

    case HSP_SONIC_STATE_WAIT_READY:
      // all dbs connected - wait for SYSTEM_READY
      {
	time_t waiting = mdata->pollBus->now.tv_sec - mdata->waitReadyStart;
	if(waiting < sp->sonic.waitReady) {
	  db_getSystemReady(mod);
	}
	else {
	  EVDebug(mod, 1, "sonic: waitReady timeout after %ld seconds", (long)waiting);
	  setSonicState(mod, HSP_SONIC_STATE_CONNECTED);
	}

db_getSystemReady(mod) checks for SYSTEM_READY|SYSTEM_STATE=UP, so waiting for SYSTEM_READY is an appropriate check. However, the collectors are not processed until hsflowd is in the CONNECTED state, which doesn't happen until hsflowd sees that the SYSTEM_READY check succeeds or times out (the timeout is 180s, which is where the 3-minute timeline in the HLD comes from) (https://github.com/sflow/host-sflow/blob/05be6bd61bd9926cd8a0e0d837a69165fd8add71/src/Linux/mod_sonic.c#L2374):

    case HSP_SONIC_STATE_CONNECTED:
      // connected and ready - learn config
      db_getMeta(mod);
      dbEvt_subscribe(mod);
      // the next steps read the starting agent/polling/collector
      // config. Any subsequent changes will be detected via dbEvt.
      setSonicState(mod, HSP_SONIC_STATE_SFLOWGLOBAL);
      break;

So, in this case, checking for collectors works as a stand-in for checking SYSTEM_READY, as long as we include a long enough timeout. I looked into different failure cases, but hsflowd truncates the .auto file when it starts, and we would only hit the SYSTEM_READY thing in the case of an hsflowd restart.

I will update my PR, as it has some further fixes that are still valuable (I reworked the PTF sample timing to catch intermittent cases where startup delay fluctuations can occasionally cause failures).

If you want to merge this one, can you get rid of the test skip and trigger the CI/CD?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @anders-nexthop for the details and pointers to SYSTEM_READY checks. I have updated my PR to get rid of test skip.

module_ignore_errors=True
)['rc'] == 0
for ip in collector_ips
)


def wait_until_hsflowd_ready(duthost, collector_ips, timeout=240, interval=10):
    """
    Wait until hsflowd has fully initialized with all specified collector configurations.

    Polls verify_hsflowd_ready every ``interval`` seconds for up to ``timeout`` seconds.
    The default of 240 seconds (4 minutes) accounts for cases where hsflowd takes over
    3 minutes to initialize (e.g., first-time sflow config enable or device reboot).

    Args:
        duthost: DUT host object
        collector_ips: List of collector IP addresses that must all be present in hsflowd.auto
        timeout: Maximum number of seconds to wait (default 240)
        interval: Number of seconds between readiness checks (default 10)

    Raises:
        Whatever pytest_assert raises on a false condition, if not all collectors are
        initialized within ``timeout`` seconds.
    """
    logger.info(f"Waiting for hsflowd to initialize with collector(s): {collector_ips}")
    # Monotonic clock: the elapsed time reported below must not be skewed by
    # wall-clock adjustments (e.g. NTP) on the test host while we are waiting.
    start_time = time.monotonic()
    ready = wait_until(
        timeout, interval, 0,  # no initial delay before the first check
        verify_hsflowd_ready,
        duthost,
        collector_ips,
    )
    pytest_assert(
        ready,
        f"hsflowd failed to initialize collector(s) {collector_ips} within {timeout} seconds. "
        f"Check /etc/hsflowd.auto in sflow container."
    )
    elapsed = time.monotonic() - start_time
    logger.info(f"hsflowd initialized with all collector(s) after {elapsed:.1f} seconds")


def config_sflow_collector(duthost, collector, config):
collector = var[collector]
if config == 'add':
Expand Down Expand Up @@ -282,7 +336,9 @@ def verify_sflow_interfaces(duthost, intf, status, sampling_rate):


@pytest.fixture
def partial_ptf_runner(request, ptfhost, tbinfo):
def partial_ptf_runner(request, duthosts, rand_one_dut_hostname, ptfhost, tbinfo):
duthost = duthosts[rand_one_dut_hostname]

def _partial_ptf_runner(**kwargs):
logger.info(f'The enabled sflow interface is: {kwargs}')
params = {'testbed_type': tbinfo['topo']['name'],
Expand All @@ -291,6 +347,15 @@ def _partial_ptf_runner(**kwargs):
'agent_id': var['lo_ip'],
'sflow_ports_file': "/tmp/sflow_ports.json"}
params.update(kwargs)

# Make sure hsflowd daemon has processed collector config before
# proceeding with traffic verification.
collectors = kwargs.get('active_collectors', '[]')
collector_list = ast.literal_eval(collectors or '[]') if isinstance(collectors, str) else collectors
collector_ips = [var[collector]['ip_addr'] for collector in collector_list]
if collector_ips:
wait_until_hsflowd_ready(duthost, collector_ips)

ptf_runner(host=ptfhost,
testdir="ptftests",
platform_dir="ptftests",
Expand Down
Loading