Merge pull request #4643 from ajdavis/issue-4623-filter-condition

Zac-HD · web-flow · commit 42126d6e2e82 · 2026-01-27T21:10:45.000-08:00
permit up to 99% assume() failures
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -8,6 +8,7 @@ their individual contributions.
 
 .. NOTE - this list is in alphabetical order by first name (or handle).
 
+* `A. Jesse Jiryu Davis <https://github.com/ajdavis>`_
 * `Aaron Meurer <https://github.com/asmeurer>`_
 * `Adam Johnson <https://github.com/adamchainz>`_
 * `Adam Matan <https://github.com/adamatan/adamatan>_`
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,8 @@
+RELEASE_TYPE: patch
+
+This patch makes Hypothesis more tolerant of slow-to-satisfy ``assume()`` calls.
+Previously, Hypothesis would give up after ``max(max_examples * 10, 1000)`` test
+calls; now it uses a statistical test to stop only when 99% confident that <1% of
+examples would pass (:issue:`4623`).
+
+Thanks to @ajdavis for this improvement!
diff --git a/hypothesis-python/src/hypothesis/internal/conjecture/engine.py b/hypothesis-python/src/hypothesis/internal/conjecture/engine.py
@@ -154,11 +154,30 @@ def timing_report(self) -> str:
         return "\n".join(out)
 
 
+# Stop when 99% confident the true valid rate is below 1%.
+# For k valid examples, we need n invalid such that:
+#     P(seeing <= k valid in n+k trials | rate=1%) <= 1%
+# k=0: (0.99)^n <= 0.01 -> n >= ln(0.01)/ln(0.99) ~= 458.2, so the base is 459.
+# Each additional valid example adds roughly base/3 (= 153) to the threshold.
+def _calculate_thresholds(
+    confidence: float = 0.99, min_valid_rate: float = 0.01
+) -> tuple[int, int]:
+    log_confidence = math.log(1 - confidence)
+    log_invalid_rate = math.log(1 - min_valid_rate)
+    base = math.ceil(log_confidence / log_invalid_rate)
+    # Approximate increase per valid example (from binomial CDF)
+    per_valid = math.ceil(base / 3)
+    return base, per_valid
+
+
+INVALID_THRESHOLD_BASE, INVALID_PER_VALID = _calculate_thresholds()
+
+
 class ExitReason(Enum):
     max_examples = "settings.max_examples={s.max_examples}"
     max_iterations = (
         "settings.max_examples={s.max_examples}, "
-        "but < 10% of examples satisfied assumptions"
+        "but < 1% of examples satisfied assumptions"
     )
     max_shrinks = f"shrunk example {MAX_SHRINKS} times"
     finished = "nothing left to do"
@@ -724,12 +743,11 @@ def _backend_cannot_proceed(
             #  while in the other case below we just want to move on to shrinking.)
             if self.valid_examples >= self.settings.max_examples:
                 self.exit_with(ExitReason.max_examples)
-            if self.call_count >= max(
-                self.settings.max_examples * 10,
-                # We have a high-ish default max iterations, so that tests
-                # don't become flaky when max_examples is too low.
-                1000,
-            ):
+            # Stop when we're 99% confident the true valid rate is below 1%.
+            invalid_threshold = (
+                INVALID_THRESHOLD_BASE + INVALID_PER_VALID * self.valid_examples
+            )
+            if (self.invalid_examples + self.overrun_examples) > invalid_threshold:
                 self.exit_with(ExitReason.max_iterations)
 
         if self.__tree_is_exhausted():
@@ -1088,8 +1106,12 @@ def should_generate_more(self) -> bool:
         # but with the important distinction that this clause will move on to
         # the shrinking phase having found one or more bugs, while the other
         # will exit having found zero bugs.
-        if self.valid_examples >= self.settings.max_examples or self.call_count >= max(
-            self.settings.max_examples * 10, 1000
+        invalid_threshold = (
+            INVALID_THRESHOLD_BASE + INVALID_PER_VALID * self.valid_examples
+        )
+        if (
+            self.valid_examples >= self.settings.max_examples
+            or (self.invalid_examples + self.overrun_examples) > invalid_threshold
         ):  # pragma: no cover
             return False
 
diff --git a/hypothesis-python/tests/conjecture/test_engine.py b/hypothesis-python/tests/conjecture/test_engine.py
@@ -36,6 +36,8 @@
 from hypothesis.internal.conjecture.data import ConjectureData, Overrun, Status
 from hypothesis.internal.conjecture.datatree import compute_max_children
 from hypothesis.internal.conjecture.engine import (
+    INVALID_PER_VALID,
+    INVALID_THRESHOLD_BASE,
     MIN_TEST_CALLS,
     ConjectureRunner,
     ExitReason,
@@ -883,6 +885,52 @@ def f(data):
     assert runner.exit_reason == ExitReason.max_iterations
 
 
+def test_max_iterations_with_all_invalid():
+    # With assume(False) on every example, we stop after INVALID_THRESHOLD_BASE + 1
+    # invalid attempts (the check is > not >=).
+    def f(data):
+        data.draw_integer(0, 2**64 - 1)
+        data.mark_invalid()
+
+    runner = ConjectureRunner(
+        f,
+        settings=settings(
+            max_examples=10_000, database=None, suppress_health_check=list(HealthCheck)
+        ),
+    )
+    runner.run()
+
+    assert runner.call_count == INVALID_THRESHOLD_BASE + 1
+    assert runner.exit_reason == ExitReason.max_iterations
+
+
+@pytest.mark.parametrize("n_valid", [1, 2, 5])
+def test_max_iterations_with_some_valid(n_valid):
+    valid_count = 0
+
+    def f(data):
+        nonlocal valid_count
+        data.draw_integer(0, 2**64 - 1)
+        if valid_count < n_valid:
+            valid_count += 1
+        else:
+            data.mark_invalid()
+
+    runner = ConjectureRunner(
+        f,
+        settings=settings(
+            max_examples=10_000, database=None, suppress_health_check=list(HealthCheck)
+        ),
+    )
+    runner.run()
+
+    assert (
+        runner.call_count
+        == n_valid + INVALID_THRESHOLD_BASE + n_valid * INVALID_PER_VALID + 1
+    )
+    assert runner.exit_reason == ExitReason.max_iterations
+
+
 def test_exit_because_shrink_phase_timeout(monkeypatch):
     val = 0
 
@@ -1215,11 +1263,11 @@ def test(data):
 
 
 def test_shrink_after_max_iterations():
-    """If we find a bug, keep looking for more, and then hit the test call
-    limit, we should still proceed to shrinking.
+    """If we find a bug, keep looking for more, and then hit the invalid
+    examples limit, we should still proceed to shrinking.
     """
     max_examples = 10
-    max_iterations = max_examples * 10
+    max_iterations = INVALID_THRESHOLD_BASE
     fail_at = max_iterations - 5
 
     invalid = set()
diff --git a/hypothesis-python/tests/cover/test_testdecorators.py b/hypothesis-python/tests/cover/test_testdecorators.py
@@ -25,6 +25,7 @@
     strategies as st,
 )
 from hypothesis.errors import Unsatisfiable
+from hypothesis.internal.conjecture.engine import INVALID_THRESHOLD_BASE
 from hypothesis.strategies import (
     binary,
     booleans,
@@ -507,8 +508,8 @@ def f(v):
     with pytest.raises(
         Unsatisfiable,
         match=(
-            r"Unable to satisfy assumptions of f\. 1000 of 1000 examples "
-            r"failed a \.filter\(\) or assume\(\)"
+            rf"Unable to satisfy assumptions of f\. {INVALID_THRESHOLD_BASE+1} of "
+            rf"{INVALID_THRESHOLD_BASE+1} examples failed a \.filter\(\) or assume\(\)"
         ),
     ):
         f()
@@ -532,8 +533,8 @@ def f(v):
         pass
 
     match = (
-        r"1000 of 1000 examples were too large to finish generating; try "
-        r"reducing the typical size of your inputs\?"
+        rf"{INVALID_THRESHOLD_BASE+1} of {INVALID_THRESHOLD_BASE+1} examples were too large to"
+        rf" finish generating; try reducing the typical size of your inputs\?"
     )
     with (
         pytest.raises(Unsatisfiable, match=match),
diff --git a/hypothesis-python/tests/nocover/test_conjecture_engine.py b/hypothesis-python/tests/nocover/test_conjecture_engine.py
@@ -28,7 +28,7 @@ def test_lot_of_dead_nodes():
     @run_to_nodes
     def nodes(data):
         for i in range(4):
-            if data.draw_integer(0, 2**8 - 1) != i:
+            if data.draw_integer(0, 2**7 - 1) != i:
                 data.mark_invalid()
         data.mark_interesting(interesting_origin())
 
diff --git a/hypothesis-python/tests/pytest/test_statistics.py b/hypothesis-python/tests/pytest/test_statistics.py
@@ -53,21 +53,21 @@ def test_prints_statistics_given_option(testdir):
     out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION)
     assert "Hypothesis Statistics" in out
     assert "max_examples=100" in out
-    assert "< 10% of examples satisfied assumptions" in out
+    assert "< 1% of examples satisfied assumptions" in out
 
 
 def test_prints_statistics_given_option_under_xdist(testdir):
     out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION, "-n", "2")
     assert "Hypothesis Statistics" in out
     assert "max_examples=100" in out
-    assert "< 10% of examples satisfied assumptions" in out
+    assert "< 1% of examples satisfied assumptions" in out
 
 
 def test_prints_statistics_given_option_with_junitxml(testdir):
     out = get_output(testdir, TESTSUITE, PRINT_STATISTICS_OPTION, "--junit-xml=out.xml")
     assert "Hypothesis Statistics" in out
     assert "max_examples=100" in out
-    assert "< 10% of examples satisfied assumptions" in out
+    assert "< 1% of examples satisfied assumptions" in out
 
 
 @skipif_threading
@@ -80,7 +80,7 @@ def test_prints_statistics_given_option_under_xdist_with_junitxml(testdir):
     )
     assert "Hypothesis Statistics" in out
     assert "max_examples=100" in out
-    assert "< 10% of examples satisfied assumptions" in out
+    assert "< 1% of examples satisfied assumptions" in out
 
 
 UNITTEST_TESTSUITE = """