Skip to content

Commit 42126d6

Browse files
authored
Merge pull request #4643 from ajdavis/issue-4623-filter-condition
permit up to 99% assume() failures
2 parents 34282ed + f69cad6 commit 42126d6

7 files changed

Lines changed: 101 additions & 21 deletions

File tree

AUTHORS.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ their individual contributions.
88

99
.. NOTE - this list is in alphabetical order by first name (or handle).
1010
11+
* `A. Jesse Jiryu Davis <https://github.com/ajdavis>`_
1112
* `Aaron Meurer <https://github.com/asmeurer>`_
1213
* `Adam Johnson <https://github.com/adamchainz>`_
1314
* `Adam Matan <https://github.com/adamatan/adamatan>`_

hypothesis-python/RELEASE.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
RELEASE_TYPE: patch
2+
3+
This patch makes Hypothesis more tolerant of slow-to-satisfy ``assume()`` calls.
4+
Previously, Hypothesis would give up after ``max_examples * 10`` attempts; now it
5+
uses a statistical test to stop only when 99% confident that <1% of examples
6+
would pass (:issue:`4623`).
7+
8+
Thanks to @ajdavis for this improvement!

hypothesis-python/src/hypothesis/internal/conjecture/engine.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,30 @@ def timing_report(self) -> str:
154154
return "\n".join(out)
155155

156156

157+
# Stop when 99% confident the true valid rate is below 1%.
158+
# For k valid examples, we need n invalid such that:
159+
# P(seeing <= k valid in n+k trials | rate=1%) <= 1%
160+
# k=0: (0.99)^n <= 0.01 -> n >= ln(0.01)/ln(0.99)
161+
# Each additional valid example adds ~ln(0.01)/ln(0.99)/3 to threshold.
162+
def _calculate_thresholds(
163+
confidence: float = 0.99, min_valid_rate: float = 0.01
164+
) -> tuple[int, int]:
165+
log_confidence = math.log(1 - confidence)
166+
log_invalid_rate = math.log(1 - min_valid_rate)
167+
base = math.ceil(log_confidence / log_invalid_rate)
168+
# Approximate increase per valid example (from binomial CDF)
169+
per_valid = math.ceil(base / 3)
170+
return base, per_valid
171+
172+
173+
# Thresholds computed once at import time, using the default arguments of
# _calculate_thresholds (confidence=0.99, min_valid_rate=0.01).
INVALID_THRESHOLD_BASE, INVALID_PER_VALID = _calculate_thresholds()
174+
175+
157176
class ExitReason(Enum):
158177
max_examples = "settings.max_examples={s.max_examples}"
159178
max_iterations = (
160179
"settings.max_examples={s.max_examples}, "
161-
"but < 10% of examples satisfied assumptions"
180+
"but < 1% of examples satisfied assumptions"
162181
)
163182
max_shrinks = f"shrunk example {MAX_SHRINKS} times"
164183
finished = "nothing left to do"
@@ -724,12 +743,11 @@ def _backend_cannot_proceed(
724743
# while in the other case below we just want to move on to shrinking.)
725744
if self.valid_examples >= self.settings.max_examples:
726745
self.exit_with(ExitReason.max_examples)
727-
if self.call_count >= max(
728-
self.settings.max_examples * 10,
729-
# We have a high-ish default max iterations, so that tests
730-
# don't become flaky when max_examples is too low.
731-
1000,
732-
):
746+
# Stop when we're 99% confident the true valid rate is below 1%.
747+
invalid_threshold = (
748+
INVALID_THRESHOLD_BASE + INVALID_PER_VALID * self.valid_examples
749+
)
750+
if (self.invalid_examples + self.overrun_examples) > invalid_threshold:
733751
self.exit_with(ExitReason.max_iterations)
734752

735753
if self.__tree_is_exhausted():
@@ -1088,8 +1106,12 @@ def should_generate_more(self) -> bool:
10881106
# but with the important distinction that this clause will move on to
10891107
# the shrinking phase having found one or more bugs, while the other
10901108
# will exit having found zero bugs.
1091-
if self.valid_examples >= self.settings.max_examples or self.call_count >= max(
1092-
self.settings.max_examples * 10, 1000
1109+
invalid_threshold = (
1110+
INVALID_THRESHOLD_BASE + INVALID_PER_VALID * self.valid_examples
1111+
)
1112+
if (
1113+
self.valid_examples >= self.settings.max_examples
1114+
or (self.invalid_examples + self.overrun_examples) > invalid_threshold
10931115
): # pragma: no cover
10941116
return False
10951117

hypothesis-python/tests/conjecture/test_engine.py

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
from hypothesis.internal.conjecture.data import ConjectureData, Overrun, Status
3737
from hypothesis.internal.conjecture.datatree import compute_max_children
3838
from hypothesis.internal.conjecture.engine import (
39+
INVALID_PER_VALID,
40+
INVALID_THRESHOLD_BASE,
3941
MIN_TEST_CALLS,
4042
ConjectureRunner,
4143
ExitReason,
@@ -883,6 +885,52 @@ def f(data):
883885
assert runner.exit_reason == ExitReason.max_iterations
884886

885887

888+
def test_max_iterations_with_all_invalid():
    """A run whose every example is invalid exits with ``max_iterations``.

    The test function marks each example invalid after one draw. The run is
    expected to end after ``INVALID_THRESHOLD_BASE + 1`` calls, the extra
    call coming from the limit being a strict ``>`` comparison.
    """

    def always_invalid(data):
        data.draw_integer(0, 2**64 - 1)
        data.mark_invalid()

    run_settings = settings(
        max_examples=10_000,
        database=None,
        suppress_health_check=list(HealthCheck),
    )
    runner = ConjectureRunner(always_invalid, settings=run_settings)
    runner.run()

    expected_calls = INVALID_THRESHOLD_BASE + 1
    assert runner.call_count == expected_calls
    assert runner.exit_reason == ExitReason.max_iterations
905+
906+
907+
@pytest.mark.parametrize("n_valid", [1, 2, 5])
def test_max_iterations_with_some_valid(n_valid):
    """Each valid example raises the invalid budget by ``INVALID_PER_VALID``.

    The first ``n_valid`` calls complete normally and every later call is
    marked invalid. The expected call count is the valid calls, plus the
    base threshold, plus the per-valid allowance, plus one for the strict
    ``>`` comparison.
    """
    accepted = 0

    def valid_then_invalid(data):
        nonlocal accepted
        data.draw_integer(0, 2**64 - 1)
        if accepted >= n_valid:
            data.mark_invalid()
        else:
            accepted += 1

    run_settings = settings(
        max_examples=10_000,
        database=None,
        suppress_health_check=list(HealthCheck),
    )
    runner = ConjectureRunner(valid_then_invalid, settings=run_settings)
    runner.run()

    expected_calls = (
        n_valid + INVALID_THRESHOLD_BASE + n_valid * INVALID_PER_VALID + 1
    )
    assert runner.call_count == expected_calls
    assert runner.exit_reason == ExitReason.max_iterations
932+
933+
886934
def test_exit_because_shrink_phase_timeout(monkeypatch):
887935
val = 0
888936

@@ -1215,11 +1263,11 @@ def test(data):
12151263

12161264

12171265
def test_shrink_after_max_iterations():
1218-
"""If we find a bug, keep looking for more, and then hit the test call
1219-
limit, we should still proceed to shrinking.
1266+
"""If we find a bug, keep looking for more, and then hit the invalid
1267+
examples limit, we should still proceed to shrinking.
12201268
"""
12211269
max_examples = 10
1222-
max_iterations = max_examples * 10
1270+
max_iterations = INVALID_THRESHOLD_BASE
12231271
fail_at = max_iterations - 5
12241272

12251273
invalid = set()

hypothesis-python/tests/cover/test_testdecorators.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
strategies as st,
2626
)
2727
from hypothesis.errors import Unsatisfiable
28+
from hypothesis.internal.conjecture.engine import INVALID_THRESHOLD_BASE
2829
from hypothesis.strategies import (
2930
binary,
3031
booleans,
@@ -507,8 +508,8 @@ def f(v):
507508
with pytest.raises(
508509
Unsatisfiable,
509510
match=(
510-
r"Unable to satisfy assumptions of f\. 1000 of 1000 examples "
511-
r"failed a \.filter\(\) or assume\(\)"
511+
rf"Unable to satisfy assumptions of f\. {INVALID_THRESHOLD_BASE+1} of "
512+
rf"{INVALID_THRESHOLD_BASE+1} examples failed a \.filter\(\) or assume\(\)"
512513
),
513514
):
514515
f()
@@ -532,8 +533,8 @@ def f(v):
532533
pass
533534

534535
match = (
535-
r"1000 of 1000 examples were too large to finish generating; try "
536-
r"reducing the typical size of your inputs\?"
536+
rf"{INVALID_THRESHOLD_BASE+1} of {INVALID_THRESHOLD_BASE+1} examples were too large to"
537+
rf" finish generating; try reducing the typical size of your inputs\?"
537538
)
538539
with (
539540
pytest.raises(Unsatisfiable, match=match),

hypothesis-python/tests/nocover/test_conjecture_engine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def test_lot_of_dead_nodes():
2828
@run_to_nodes
2929
def nodes(data):
3030
for i in range(4):
31-
if data.draw_integer(0, 2**8 - 1) != i:
31+
if data.draw_integer(0, 2**7 - 1) != i:
3232
data.mark_invalid()
3333
data.mark_interesting(interesting_origin())
3434

hypothesis-python/tests/pytest/test_statistics.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,21 +53,21 @@ def test_prints_statistics_given_option(testdir):
5353
out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION)
5454
assert "Hypothesis Statistics" in out
5555
assert "max_examples=100" in out
56-
assert "< 10% of examples satisfied assumptions" in out
56+
assert "< 1% of examples satisfied assumptions" in out
5757

5858

5959
def test_prints_statistics_given_option_under_xdist(testdir):
6060
out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION, "-n", "2")
6161
assert "Hypothesis Statistics" in out
6262
assert "max_examples=100" in out
63-
assert "< 10% of examples satisfied assumptions" in out
63+
assert "< 1% of examples satisfied assumptions" in out
6464

6565

6666
def test_prints_statistics_given_option_with_junitxml(testdir):
6767
out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION, "--junit-xml=out.xml")
6868
assert "Hypothesis Statistics" in out
6969
assert "max_examples=100" in out
70-
assert "< 10% of examples satisfied assumptions" in out
70+
assert "< 1% of examples satisfied assumptions" in out
7171

7272

7373
@skipif_threading
@@ -80,7 +80,7 @@ def test_prints_statistics_given_option_under_xdist_with_junitxml(testdir):
8080
)
8181
assert "Hypothesis Statistics" in out
8282
assert "max_examples=100" in out
83-
assert "< 10% of examples satisfied assumptions" in out
83+
assert "< 1% of examples satisfied assumptions" in out
8484

8585

8686
UNITTEST_TESTSUITE = """

0 commit comments

Comments
 (0)