fix finding of failed tests in output of PyTorch test step #2859
base: develop
Changes from all commits
0f6a9b1
7187b15
63abd5d
b296c04
b8770f7
eaf4056
be911fc
4952598
8e0ccbf
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,6 +32,7 @@ | |
| import re | ||
| import tempfile | ||
| import easybuild.tools.environment as env | ||
| from collections import namedtuple | ||
| from distutils.version import LooseVersion | ||
| from easybuild.easyblocks.generic.pythonpackage import PythonPackage | ||
| from easybuild.framework.easyconfig import CUSTOM | ||
|
|
@@ -42,6 +43,94 @@ | |
| from easybuild.tools.systemtools import POWER, get_cpu_architecture | ||
|
|
||
|
|
||
| def get_count_for_pattern(regex, text): | ||
| """Match the regexp containing a single group and return the integer value of the matched group. | ||
| Return zero if no match or more than one match was found, and warn in the latter case | ||
| """ | ||
| match = re.findall(regex, text) | ||
| if len(match) == 1: | ||
| return int(match[0]) | ||
| elif len(match) > 1: | ||
| # Shouldn't happen, but means something went wrong with the regular expressions. | ||
| # Throw warning, as the build might be fine, no need to error on this. | ||
| warn_msg = "Error in counting the number of test failures in the output of the PyTorch test suite.\n" | ||
| warn_msg += "Please check the EasyBuild log to verify the number of failures (if any) was acceptable." | ||
| print_warning(warn_msg) | ||
| return 0 | ||
|
|
||
|
|
||
| def extract_failed_tests_info(tests_out): | ||
| """ | ||
| Extract information about failed tests from output produced by PyTorch tests | ||
| """ | ||
|
|
||
| # Create clear summary report | ||
| failure_report = "" | ||
| failure_cnt = 0 | ||
| error_cnt = 0 | ||
| failed_test_suites = [] | ||
|
|
||
| # Look for patterns like: | ||
| # Ran 219 tests in 67.325s | ||
| # | ||
| # FAILED (errors=10, skipped=190, expected failures=6) | ||
| # test_fx failed! | ||
| regex = (r"^Ran (?P<test_cnt>[0-9]+) tests.*$\n\n" | ||
| r"FAILED \((?P<failure_summary>.*)\)$\n" | ||
| r"(?:^(?:(?!failed!).)*$\n)*" | ||
| r"(?P<failed_test_suite_name>.*) failed!(?: Received signal: \w+)?\s*$") | ||
|
|
||
| for match in re.finditer(regex, tests_out, re.M): | ||
| # E.g. 'failures=3, errors=10, skipped=190, expected failures=6' | ||
| failure_summary, test_cnt, test_suite = match.group('failure_summary', 'test_cnt', 'failed_test_suite_name') | ||
| failure_report += test_suite + ' (' + test_cnt + " total tests, " + failure_summary + ')\n' | ||
| failure_cnt += get_count_for_pattern(r"(?<!expected )failures=([0-9]+)", failure_summary) | ||
| error_cnt += get_count_for_pattern(r"errors=([0-9]+)", failure_summary) | ||
| failed_test_suites.append(test_suite) | ||
|
|
||
| # Look for patterns like: | ||
| # ===================== 2 failed, 128 passed, 2 skipped, 2 warnings in 3.43s ===================== | ||
| regex = r"^=+ (?P<failure_summary>.*) in [0-9]+\.*[0-9]*[a-zA-Z]* =+$\n(?P<failed_test_suite_name>.*) failed!$" | ||
|
|
||
| for match in re.finditer(regex, tests_out, re.M): | ||
| # E.g. '2 failed, 128 passed, 2 skipped, 2 warnings' | ||
| failure_summary, test_suite = match.group('failure_summary', 'failed_test_suite_name') | ||
| failure_report += test_suite + ' ' + failure_summary + '\n' | ||
| failure_cnt += get_count_for_pattern(r"([0-9]+) failed", failure_summary) | ||
| error_cnt += get_count_for_pattern(r"([0-9]+) error", failure_summary) | ||
| failed_test_suites.append(test_suite) | ||
|
|
||
| # Look for patterns like: | ||
| # AssertionError: 4 unit test(s) failed: | ||
| # ... | ||
| # distributed/test_c10d_gloo failed! | ||
| regex = (r"^AssertionError: (?P<failed_test_cnt>[0-9]+) unit test\(s\) failed:$\n" | ||
| r"(?:^(?:(?!failed!).)*$\n)*" | ||
| r"(?P<failed_test_group_name>.*) failed!$") | ||
| for match in re.finditer(regex, tests_out, re.M): | ||
| test_group, failed_test_cnt = match.group('failed_test_group_name', 'failed_test_cnt') | ||
| failure_report += test_group + ' (' + failed_test_cnt + " failed tests)\n" | ||
| failure_cnt += int(failed_test_cnt) | ||
| failed_test_suites.append(test_group) | ||
|
|
||
| # Look for patterns like: | ||
| # Running test_jit_cuda_fuser ... [2023-01-12 04:04:08.949222] | ||
| # ... | ||
| # AttributeError: 'NoneType' object has no attribute 'split' | ||
| # test_jit_cuda_fuser failed! | ||
| regex = (r"^Running (?P<test_name>test_[^\s]+) .*\n" | ||
| r"(?:^(?:(?!(Ran [0-9]+|failed!)).)*$\n)*" | ||
| r"\1 failed!$") | ||
| for match in re.finditer(regex, tests_out, re.M): | ||
| test_name = match.group('test_name') | ||
| failure_report += test_name + " (unknown failed test count)\n" | ||
| failure_cnt += 1 | ||
| failed_test_suites.append(test_name) | ||
|
|
||
| TestsInfo = namedtuple('TestsInfo', ('failure_report', 'failure_cnt', 'error_cnt', 'failed_test_suites')) | ||
| return TestsInfo(failure_report, failure_cnt, error_cnt, failed_test_suites) | ||
|
|
||
|
|
||
| class EB_PyTorch(PythonPackage): | ||
| """Support for building/installing PyTorch.""" | ||
|
|
||
|
|
@@ -268,65 +357,11 @@ def test_step(self): | |
|
|
||
| tests_out, tests_ec = super(EB_PyTorch, self).test_step(return_output_ec=True) | ||
|
|
||
| def get_count_for_pattern(regex, text): | ||
| """Match the regexp containing a single group and return the integer value of the matched group. | ||
| Return zero if no match or more than one match was found, and warn in the latter case | ||
| """ | ||
| match = re.findall(regex, text) | ||
| if len(match) == 1: | ||
| return int(match[0]) | ||
| elif len(match) > 1: | ||
| # Shouldn't happen, but means something went wrong with the regular expressions. | ||
| # Throw warning, as the build might be fine, no need to error on this. | ||
| warn_msg = "Error in counting the number of test failures in the output of the PyTorch test suite.\n" | ||
| warn_msg += "Please check the EasyBuild log to verify the number of failures (if any) was acceptable." | ||
| print_warning(warn_msg) | ||
| return 0 | ||
|
|
||
| # Create clear summary report | ||
| failure_report = "" | ||
| failure_cnt = 0 | ||
| error_cnt = 0 | ||
| failed_test_suites = [] | ||
|
|
||
| # Grep for patterns like: | ||
| # Ran 219 tests in 67.325s | ||
| # | ||
| # FAILED (errors=10, skipped=190, expected failures=6) | ||
| # test_fx failed! | ||
| regex = (r"^Ran (?P<test_cnt>[0-9]+) tests.*$\n\n" | ||
| r"FAILED \((?P<failure_summary>.*)\)$\n" | ||
| r"(?:^(?:(?!failed!).)*$\n)*" | ||
| r"(?P<failed_test_suite_name>.*) failed!(?: Received signal: \w+)?\s*$") | ||
|
|
||
| for m in re.finditer(regex, tests_out, re.M): | ||
| # E.g. 'failures=3, errors=10, skipped=190, expected failures=6' | ||
| failure_summary = m.group('failure_summary') | ||
| total, test_suite = m.group('test_cnt', 'failed_test_suite_name') | ||
| failure_report += "{test_suite} ({total} total tests, {failure_summary})\n".format( | ||
| test_suite=test_suite, total=total, failure_summary=failure_summary | ||
| ) | ||
| failure_cnt += get_count_for_pattern(r"(?<!expected )failures=([0-9]+)", failure_summary) | ||
| error_cnt += get_count_for_pattern(r"errors=([0-9]+)", failure_summary) | ||
| failed_test_suites.append(test_suite) | ||
|
|
||
| # Grep for patterns like: | ||
| # ===================== 2 failed, 128 passed, 2 skipped, 2 warnings in 3.43s ===================== | ||
| regex = r"^=+ (?P<failure_summary>.*) in [0-9]+\.*[0-9]*[a-zA-Z]* =+$\n(?P<failed_test_suite_name>.*) failed!$" | ||
|
|
||
| for m in re.finditer(regex, tests_out, re.M): | ||
| # E.g. '2 failed, 128 passed, 2 skipped, 2 warnings' | ||
| failure_summary = m.group('failure_summary') | ||
| test_suite = m.group('failed_test_suite_name') | ||
| failure_report += "{test_suite} ({failure_summary})\n".format( | ||
| test_suite=test_suite, failure_summary=failure_summary | ||
| ) | ||
| failure_cnt += get_count_for_pattern(r"([0-9]+) failed", failure_summary) | ||
| error_cnt += get_count_for_pattern(r"([0-9]+) error", failure_summary) | ||
| failed_test_suites.append(test_suite) | ||
| failed_tests_info = extract_failed_tests_info(tests_out) | ||
| failure_report = failed_tests_info.failure_report | ||
|
Contributor
Nit: To avoid long distracting lines like |
||
|
|
||
| # Make the names unique and sorted | ||
| failed_test_suites = sorted(set(failed_test_suites)) | ||
| failed_test_suites = sorted(set(failed_tests_info.failed_test_suites)) | ||
| # Gather all failed test suites in case we missed any (e.g. when it exited due to syntax errors) | ||
| # Also unique and sorted to be able to compare the lists below | ||
| all_failed_test_suites = sorted(set( | ||
|
|
@@ -341,17 +376,17 @@ def get_count_for_pattern(regex, text): | |
| failure_report += '\n' + failure_report_save | ||
|
|
||
| # Calculate total number of unsuccessful and total tests | ||
| failed_test_cnt = failure_cnt + error_cnt | ||
| failed_test_cnt = failed_tests_info.failure_cnt + failed_tests_info.error_cnt | ||
| test_cnt = sum(int(hit) for hit in re.findall(r"^Ran (?P<test_cnt>[0-9]+) tests in", tests_out, re.M)) | ||
| max_failed_tests = self.cfg['max_failed_tests'] | ||
| self.log.info("%d unsuccessful tests (out of %d), - max. failed tests set to %d", | ||
| failed_test_cnt, test_cnt, max_failed_tests) | ||
|
|
||
| if failed_test_cnt > 0: | ||
| max_failed_tests = self.cfg['max_failed_tests'] | ||
|
|
||
| failure_or_failures = 'failure' if failure_cnt == 1 else 'failures' | ||
| error_or_errors = 'error' if error_cnt == 1 else 'errors' | ||
| msg = "%d test %s, %d test %s (out of %d):\n" % ( | ||
| failure_cnt, failure_or_failures, error_cnt, error_or_errors, test_cnt | ||
| ) | ||
| failure_or_failures = 'failure' if failed_tests_info.failure_cnt == 1 else 'failures' | ||
| msg = "%d test %s, " % (failed_tests_info.failure_cnt, failure_or_failures) | ||
| error_or_errors = 'error' if failed_tests_info.error_cnt == 1 else 'errors' | ||
| msg += "%d test %s (out of %d):\n" % (failed_tests_info.error_cnt, error_or_errors, test_cnt) | ||
| msg += failure_report | ||
|
|
||
| # If no tests are supposed to fail, or some tests failed for which we could not count errors, fail now | ||
|
|
||
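For reference, a hedged usage sketch of the `get_count_for_pattern` helper promoted to module level above; it assumes an easybuild-easyblocks checkout that includes this change is importable, and the sample summary string is the one quoted as an example in the diff's comments:

```python
# Hedged sketch, not part of the PR: exercises get_count_for_pattern as defined above.
from easybuild.easyblocks import pytorch

# Example failure summary, as quoted in the comments of the diff above.
summary = "failures=3, errors=10, skipped=190, expected failures=6"

# Exactly one match of the single capture group: its integer value is returned.
assert pytorch.get_count_for_pattern(r"(?<!expected )failures=([0-9]+)", summary) == 3
assert pytorch.get_count_for_pattern(r"errors=([0-9]+)", summary) == 10

# No match at all: zero is returned (multiple matches also return zero, after a warning).
assert pytorch.get_count_for_pattern(r"([0-9]+) failed", summary) == 0
```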
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,6 +40,7 @@ | |
| from easybuild.base.testing import TestCase | ||
| from easybuild.easyblocks.generic.cmakemake import det_cmake_version | ||
| from easybuild.easyblocks.generic.toolchain import Toolchain | ||
| from easybuild.easyblocks import pytorch | ||
| from easybuild.framework.easyblock import EasyBlock, get_easyblock_instance | ||
| from easybuild.framework.easyconfig.easyconfig import process_easyconfig | ||
| from easybuild.tools import config | ||
|
|
@@ -51,6 +52,97 @@ | |
| from easybuild.tools.options import set_tmpdir | ||
| from easybuild.tools.py2vs3 import StringIO | ||
|
|
||
| PYTORCH_TESTS_OUTPUT = """ | ||
| ... | ||
| AssertionError: Expected zero exit code but got -6 for pid: 2006681 | ||
|
|
||
| ---------------------------------------------------------------------- | ||
| Ran 2 tests in 6.576s | ||
|
|
||
| FAILED (failures=2) | ||
| distributed/fsdp/test_fsdp_input failed! | ||
| Running distributed/fsdp/test_fsdp_multiple_forward ... [2023-01-12 05:46:45.746098] | ||
|
|
||
| RuntimeError: Process 0 terminated or timed out after 610.0615825653076 seconds | ||
|
|
||
| ---------------------------------------------------------------------- | ||
| Ran 1 test in 610.744s | ||
|
|
||
| FAILED (errors=1) | ||
| Test exited with non-zero exitcode 1. Command to reproduce: /software/Python/3.9.6-GCCcore-11.2.0/bin/python distributed/test_c10d_gloo.py -v DistributedDataParallelTest.test_ddp_comm_hook_register_just_once | ||
|
|
||
| RuntimeError: Process 0 terminated or timed out after 610.0726096630096 seconds | ||
|
|
||
| ---------------------------------------------------------------------- | ||
| Ran 1 test in 610.729s | ||
|
|
||
| FAILED (errors=1) | ||
| Test exited with non-zero exitcode 1. Command to reproduce: /software/Python/3.9.6-GCCcore-11.2.0/bin/python distributed/test_c10d_gloo.py -v DistributedDataParallelTest.test_ddp_invalid_comm_hook_init | ||
| test_ddp_invalid_comm_hook_return_type (__main__.DistributedDataParallelTest) | ||
|
|
||
| AssertionError: 4 unit test(s) failed: | ||
| DistributedDataParallelTest.test_ddp_comm_hook_register_just_once | ||
| DistributedDataParallelTest.test_ddp_invalid_comm_hook_init | ||
| ProcessGroupGlooTest.test_round_robin | ||
| ProcessGroupGlooTest.test_round_robin_create_destroy | ||
| distributed/test_c10d_gloo failed! | ||
| Running distributed/test_c10d_nccl ... [2023-01-12 07:43:41.085197] | ||
|
|
||
| ValueError: For each axis slice, the sum of the observed frequencies must agree with the sum of the expected frequencies to a relative tolerance of 1e-08, but the percent differences are: | ||
| 4.535600093557479e-05 | ||
|
|
||
| ---------------------------------------------------------------------- | ||
| Ran 216 tests in 22.396s | ||
|
|
||
| FAILED (errors=4) | ||
| distributions/test_distributions failed! | ||
|
|
||
| Running test_autograd ... [2023-01-13 04:19:25.587981] | ||
| Executing ['/software/Python/3.9.6-GCCcore-11.2.0/bin/python', 'test_autograd.py', '-v'] ... [2023-01-13 04:19:25.588074] | ||
| ... | ||
| test_autograd_views_codegen (__main__.TestAutograd) ... ok | ||
| ... | ||
| ====================================================================== | ||
| FAIL: test_thread_shutdown (__main__.TestAutograd) | ||
| ---------------------------------------------------------------------- | ||
| Traceback (most recent call last): | ||
| File "/tmp/vsc40023/easybuild_build/PyTorch/1.11.0/foss-2021b/pytorch-v1.11.0/test/test_autograd.py", line 4220, in test_thread_shutdown | ||
| self.assertRegex(s, "PYTORCH_API_USAGE torch.autograd.thread_shutdown") | ||
| AssertionError: Regex didn't match: 'PYTORCH_API_USAGE torch.autograd.thread_shutdown' not found in 'PYTORCH_API_USAGE torch.python.import\nPYTORCH_API_USAGE c10d.python.import\nPYTORCH_API_USAGE tensor.create\n' | ||
| ---------------------------------------------------------------------- | ||
| Ran 464 tests in 18.443s | ||
|
|
||
| FAILED (failures=1, skipped=52, expected failures=1) | ||
| test_autograd failed! | ||
| Running test_binary_ufuncs ... [2023-01-12 09:02:45.049490] | ||
| ... | ||
|
|
||
| Running test_jit_cuda_fuser ... [2023-01-12 04:04:08.949222] | ||
| Executing ['/software/Python/3.9.6-GCCcore-11.2.0/bin/python', 'test_jit_cuda_fuser.py', '-v'] ... [2023-01-12 04:04:08.949319] | ||
| CUDA not available, skipping tests | ||
| monkeytype is not installed. Skipping tests for Profile-Directed Typing | ||
| Traceback (most recent call last): | ||
| File "/tmp/easybuild_build/PyTorch/1.11.0/foss-2021b/pytorch-v1.11.0/test/test_jit_cuda_fuser.py", line 25, in <module> | ||
| CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.')) | ||
| AttributeError: 'NoneType' object has no attribute 'split' | ||
| test_jit_cuda_fuser failed! | ||
| ... | ||
| Running distributions/test_constraints ... [2023-01-12 09:05:15.013470] | ||
| SKIPPED [2] distributions/test_constraints.py:83: `biject_to` not implemented. | ||
| FAILED distributions/test_constraints.py::test_constraint[True-constraint_fn5-False-value5] | ||
| FAILED distributions/test_constraints.py::test_constraint[True-constraint_fn7-True-value7] | ||
| ============= 2 failed, 128 passed, 2 skipped, 2 warnings in 8.66s ============= | ||
| distributions/test_constraints failed! | ||
|
|
||
| Running distributions/rpc/test_tensorpipe_agent ... [2023-01-12 09:06:37.093571] | ||
| ... | ||
| Ran 123 tests in 7.549s | ||
|
|
||
| FAILED (errors=2, skipped=2) | ||
| ... | ||
| test_fx failed! Received signal: SIGSEGV | ||
| """ # noqa | ||
|
||
|
|
||
|
|
||
| class EasyBlockSpecificTest(TestCase): | ||
| """ Baseclass for easyblock testcases """ | ||
|
|
@@ -265,6 +357,39 @@ def test_det_cmake_version(self): | |
| """)) | ||
| self.assertEqual(det_cmake_version(), '1.2.3-rc4') | ||
|
|
||
| def test_pytorch_extract_failed_tests_info(self): | ||
| """ | ||
| Test extract_failed_tests_info function from PyTorch easyblock. | ||
| """ | ||
| res = pytorch.extract_failed_tests_info(PYTORCH_TESTS_OUTPUT) | ||
| self.assertEqual(len(res), 4) | ||
|
|
||
| expected_failure_report = '\n'.join([ | ||
| "distributed/fsdp/test_fsdp_input (2 total tests, failures=2)", | ||
| "distributions/test_distributions (216 total tests, errors=4)", | ||
| "test_autograd (464 total tests, failures=1, skipped=52, expected failures=1)", | ||
| "test_fx (123 total tests, errors=2, skipped=2)", | ||
|
Contributor
This or the SIGSEGV test needs to be another one or

Contributor
Looks like we have a bug here. For the output of it must NOT match the RegExp. The original, correct output is: So the RegExp must be made to only add the counting of "FAILED" if the next line is the failed test, see https://gist.github.com/Flamefire/dc1403ccefdebfc3412c6fbb2d5cbabd#file-pytorch-1-9-0-foss-2020b_partial-log-L480 I would actually add this wrong output as a test against regressions. With that snippet it should only find |
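A hedged sketch of the kind of regression case suggested here; the output snippet and suite names below are invented for illustration (the real problematic output is in the linked gist), and the expected values assume the first regex is tightened so a FAILED summary is only counted when it is directly followed by its own "&lt;suite&gt; failed!" line:

```python
# Hypothetical regression sample, invented for illustration: a "FAILED (...)"
# summary from a suite that ultimately passed (e.g. after a re-run), followed
# by a different suite that fails with a pytest-style summary of its own.
from easybuild.easyblocks import pytorch

RERUN_THEN_OTHER_FAILURE = """
Ran 3 tests in 1.234s

FAILED (failures=1)
Running distributed/test_b ... [2023-01-12 05:00:00.000000]
============= 2 failed, 10 passed in 1.23s =============
distributed/test_b failed!
"""

res = pytorch.extract_failed_tests_info(RERUN_THEN_OTHER_FAILURE)
# With the tightened regex, only the pytest-style summary should be counted;
# the unrelated "failures=1" above must not be attributed to distributed/test_b.
assert res.failure_cnt == 2
assert res.failed_test_suites == ['distributed/test_b']
# Against the current code in this PR, failure_cnt would come out as 3 instead,
# which is exactly the misattribution described in this comment.
```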
||
| "distributions/test_constraints 2 failed, 128 passed, 2 skipped, 2 warnings", | ||
| "distributed/test_c10d_gloo (4 failed tests)", | ||
| "test_jit_cuda_fuser (unknown failed test count)", | ||
| ]) | ||
| self.assertEqual(res.failure_report.strip(), expected_failure_report) | ||
| # test failures | ||
|
||
| self.assertEqual(res.failure_cnt, 10) | ||
| # test errors | ||
| self.assertEqual(res.error_cnt, 6) | ||
|
|
||
| expected_failed_test_suites = [ | ||
| 'distributed/fsdp/test_fsdp_input', | ||
| 'distributions/test_distributions', | ||
| 'test_autograd', | ||
| 'test_fx', | ||
| 'distributions/test_constraints', | ||
| 'distributed/test_c10d_gloo', | ||
| 'test_jit_cuda_fuser', | ||
| ] | ||
| self.assertEqual(res.failed_test_suites, expected_failed_test_suites) | ||
|
|
||
|
|
||
| def suite(): | ||
| """Return all easyblock-specific tests.""" | ||
|
|
||