python · tim-one · May 19, 2024 · May 8, 2024 · May 8, 2024 · May 8, 2024
diff --git a/Lib/_pylong.py b/Lib/_pylong.py
@@ -211,6 +211,137 @@ def inner(a, b):
     return inner(0, len(s))
 
 
+# Asymptotically faster version, using the C decimal module. See
+# comments at the end of the file. This uses decimal arithmetic to
+# convert from base 10 to base 256. The latter is just a string of
+# bytes, which CPython can convert very efficiently to a Python int.
+
+# log of 10 to base 256 with best-possible 53-bit precision. Obtained
+# via:
+#    from mpmath import mp
+#    mp.prec = 1000
+#    print(float(mp.log(10, 256)).hex())
+_LOG_10_BASE_256 = float.fromhex('0x1.a934f0979a371p-2') # about 0.415
+
+# _spread is for internal testing. It maps a key to the number of times
+# that condition obtained in _dec_str_to_int_inner:
+#     key 0 - quotient guess was right
+#     key 1 - quotient had to be boosted by 1, one time
+#     key 999 - one adjustment wasn't enough, so fell back to divmod
+from collections import defaultdict
+_spread = defaultdict(int)
+del defaultdict
+
+def _dec_str_to_int_inner(s, *, GUARD=8):
+    BYTELIM = 512
+    D = decimal.Decimal
+    result = bytearray()
+    # See notes at end of file for discussion of GUARD.
+    assert GUARD > 0 # if 0, `decimal` can blow up - .prec 0 not allowed
+
+    def inner(n, w):
+        #assert n < D256 ** w # required, but too expensive to check
+        if w <= BYTELIM:
+            # XXX Stefan Pochmann discovered that, for 1024-bit ints,
+            # `int(Decimal)` took 2.5x longer than `int(str(Decimal))`.
+            # So simplify this code to the former if/when that gets
+            # repaired.
+            result.extend(int(str(n)).to_bytes(w)) # big-endian default
+            return
+        w2 = w >> 1
+        if 0:
+            # This is maximally clear, but "too slow". `decimal`
+            # division is asymptotically fast, but we have no way to
+            # tell it to reuse the high-precision reciprocal it computes
+            # for pow256[w2], so it has to recompute it over & over &
+            # over again :-(
+            hi, lo = divmod(n, pow256[w2][0])
+        else:
+            p256, recip = pow256[w2]
+            # The integer part will have a number of digits about equal
+            # to the difference between the log10s of `n` and `pow256`
+            # (which, since these are integers, is roughly approximated
+            # by `.adjusted()`). That's the working precision we need,
+            ctx.prec = max(n.adjusted() - p256.adjusted(), 0) + GUARD
+            hi = +n * +recip # unary `+` chops back to ctx.prec digits
+            ctx.prec = decimal.MAX_PREC
+            hi = hi.to_integral_value() # lose the fractional digits
+            lo = n - hi * p256
+            # Because we've been uniformly rounding down, `hi` is a
+            # lower bound on the correct quotient.
+            assert lo >= 0
+            # Adjust quotient up if needed. It usually isn't. In random
+            # testing on inputs through 2.5 billion digit strings, the
+            # test triggered about one in 100 thousand cases.
+            count = 0
+            if lo >= p256:
+                count = 1
+                lo -= p256
+                hi += 1
+                if lo >= p256:
+                    # Complete correction via an exact computation. I
+                    # believe it's not possible to get here provided
+                    # GUARD >= 3. It's tested by reducing GUARD below
+                    # that.
+                    count = 999
+                    hi2, lo = divmod(lo, p256)
+                    hi += hi2
+            _spread[count] += 1
+            # The assert should always succeed, but way too slow to keep
+            # enabled.
+            #assert (hi, lo) == divmod(n, pow256[w2][0])
+        inner(hi, w - w2)
+        inner(lo, w2)
+
+    # How many base 256 digits are needed? Mathematically, exactly
+    # floor(log256(int(s))) + 1. There is no cheap way to compute this.
+    # But we can get an upper bound, and that's necessary for our error
+    # analysis to make sense. int(s) < 10**len(s), so the log needed is
+    # < log256(10**len(s)) = len(s) * log256(10). However, using
+    # finite-precision floating point for this, it's possible that the
+    # computed value is a little less than the true value. If the true
+    # value is at - or a little higher than - an integer, we can get an
+    # off-by-1 error too low. So we add 2 instead of 1 if chopping lost
+    # a fraction > 0.9.
+
+    # The "WASI" test platform can complain about `len(s)` if it's too
+    # large to fit in its idea of "an index-sized integer".
+    lenS = s.__len__()
+    log_ub = lenS * _LOG_10_BASE_256
+    log_ub_as_int = int(log_ub)
+    w = log_ub_as_int + 1 + (log_ub - log_ub_as_int > 0.9)
+    # And what if we've plain exhausted the limits of HW floats? We
+    # could compute the log to any desired precision using `decimal`,
+    # but it's not plausible that anyone will pass a string requiring
+    # trillions of bytes (unless they're just trying to "break things").
+    if w.bit_length() >= 46:
+        # "Only" had < 53 - 46 = 7 bits to spare in IEEE-754 double.
+        raise ValueError(f"cannot convert string of len {lenS} to int")
+    with decimal.localcontext(_unbounded_dec_context) as ctx:
+        D256 = D(256)
+        pow256 = compute_powers(w, D256, BYTELIM)
+        rpow256 = compute_powers(w, 1 / D256, BYTELIM)
+        # We're going to do inexact, chopped arithmetic, multiplying by
+        # an approximation to the reciprocal of 256**i. We chop to get a
+        # lower bound on the true integer quotient. Our approximation is
+        # a lower bound, the multiplication is chopped too, and
+        # to_integral_value() is also chopped.
+        ctx.traps[decimal.Inexact] = 0
+        ctx.rounding = decimal.ROUND_DOWN
+        for k, v in pow256.items():
+            # No need to save much more precision in the reciprocal than
+            # the power of 256 has, plus some guard digits to absorb
+            # most relevant rounding errors. This is highly significant:
+            # 1/2**i has the same number of significant decimal digits
+            # as 5**i, generally over twice the number in 2**i.
+            ctx.prec = v.adjusted() + GUARD + 2
+            # The unary "+" chops the reciprocal back to that precision.
+            pow256[k] = v, +rpow256[k]
+        del rpow256 # exact reciprocals no longer needed
+        ctx.prec = decimal.MAX_PREC
+        inner(D(s), w)
+    return int.from_bytes(result)
+
 def int_from_string(s):
     """Asymptotically fast version of PyLong_FromString(), conversion
     of a string of decimal digits into an 'int'."""
@@ -219,7 +350,10 @@ def int_from_string(s):
     # and underscores, and stripped leading whitespace.  The input can still
     # contain underscores and have trailing whitespace.
     s = s.rstrip().replace('_', '')
-    return _str_to_int_inner(s)
+    func = _str_to_int_inner
+    if len(s) >= 3_500_000 and _decimal is not None:
+        func = _dec_str_to_int_inner
+    return func(s)
 
 def str_to_int(s):
     """Asymptotically fast version of decimal string to 'int' conversion."""
@@ -361,3 +495,101 @@ def int_divmod(a, b):
         return ~q, b + ~r
     else:
         return _divmod_pos(a, b)
+
+
+# Notes on _dec_str_to_int_inner:
+#
+# Stefan Pochmann worked up a str->int function that used the decimal
+# module to, in effect, convert from base 10 to base 256. This is
+# "unnatural", in that it requires multiplying and dividing by large
+# powers of 2, which `decimal` isn't naturally suited to. But
+# `decimal`'s `*` and `/` are asymptotically superior to CPython's, so
+# at _some_ point it could be expected to win.
+#
+# Alas, the crossover point was too high to be of much real interest. I
+# (Tim) then worked on ways to replace its division with multiplication
+# by a cached reciprocal approximation instead, fixing up errors
+# afterwards. This reduced the crossover point significantly.
+#
+# I revisited the code, and found ways to improve and simplify it. The
+# crossover point is at about 3.4 million digits now.
+#
+# GUARD digits
+# ------------
+# We only want the integer part of divisions, so don't need to build
+# the full multiplication tree. But using _just_ the number of
+# digits expected in the integer part ignores too much. What's left
+# out can have a very significant effect on the quotient. So we use
+# GUARD additional digits.
+#
+# The default 8 is more than enough so no more than 1 correction step
+# was ever needed for all inputs tried through 2.5 billion digits. In
+# fact, I believe 5 guard digits are always enough - but the proof is
+# very involved, so better safe than sorry.
+#
+# Short course:
+#
+# If prec is the decimal precision in effect, and we're rounding down,
+# the result of an operation is exactly equal to the infinitely precise
+# result times 1-e for some real e with 0 <= e < 10**(1-prec). We have
+# 3 operations: chopping n back to prec digits, likewise for 1/256**w2,
+# and also for their product.
+#
+# So the computed product is exactly equal to the true product times
+# (1-e1)*(1-e2)*(1-e3); since the e's are all very small, an excellent
+# approximation to the second factor is 1-(e1+e2+e3) (the 2nd and 3rd
+# order terms in the expanded product are too tiny to matter). If
+# they're all as large as possible, that's 1 - 3*10**(1-prec). This,
+# BTW, is all bog-standard FP error analysis.
+#
+# That implies the computed product is within 1 of the true product
+# provided prec >= log10(true_product) + 1.47712125.
+#
+# Here are telegraphic details, rephrasing the initial condition in
+# equivalent ways, step by step:
+#
+# prod - prod * (1 - 3*10**(1-prec)) <= 1
+# prod - prod + prod * 3*10**(1-prec) <= 1
+# prod * 3*10**(1-prec) <= 1
+# 10**(log10(prod)) * 3*10**(1-prec) <= 1
+# 3*10**(1-prec+log10(prod)) <= 1
+# 10**(1-prec+log10(prod)) <= 1/3
+# 1-prec+log10(prod) <= log10(1/3) = -0.47712125
+# -prec <= -1.47712125 - log10(prod)
+# prec >= log10(prod) + 1.47712125
+#
+# n.adjusted() - p256.adjusted() is a crude integer approximation to
+# log10(true_product) - but prec is necessarily an int too, and via
+# tedious case analysis it can be shown that the "crude approximation"
+# is never smaller than the floor of the true ratio's log10. For
+# example, in 8E20 / 1E20, it gives 20 - 20 = 0, which is the floor of
+# log10(8). It also gives 0 for 1E20/9E20 (`.adjusted()` doesn't look at
+# the digits at all - it just gives the power-of-10 exponent of the most
+# significant digit, whatever it may be). But in that case it's the
+# ceiling of the true log10 (which is a bit larger than -1). So "it's
+# close", but since it may be as bad as (but no worse than) 1 too small,
+# we have to assume the worst: 1 too small.
+#
+# Also skipping why cutting the reciprocal to p256.adjusted() + GUARD
+# digits to begin with is good enough. The precondition n < 256**w is
+# needed to establish that the true product can't be too large for the
+# reciprocal approximation to be too narrow. But read on for more ;-)
+#
+# But since this is just a sketch of a proof ;-), the code uses the
+# empirically tested 8 instead of 5. 3 digits more or less makes no
+# practical difference to speed - these ints are huge. And while
+# increasing GUARD above 5 may not be necessary, every increase cuts the
+# percentage of cases that need a correction at all.
+#
+# LATER: doing this analysis pointed out an error: our division isn't
+# exactly "balanced", in that when `w` is odd the integer part of
+# n/256**w2 can be larger than 256**w2. The code used enough working
+# precision in the multiply then, but the precomputed reciprocal
+# approximation didn't have that many good digits to give. This was
+# repaired by retaining 2 more digits in the reciprocal.
+#
+# After that, I believe GUARD=3 should be enough. Which was "the
+# obvious" conclusion I leaped to after deriving `prec >= log10(prod) +
+# 1.47712125` (adding the fractional part of the log to 1.47 ... could
+# push that over 2, and then the ceiling is needed to get an integer >=
+# that). But, at that time, I knew GUARDs of 3 and 4 "didn't work".
diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py
@@ -919,5 +919,57 @@ def test_pylong_roundtrip(self):
             self.assertEqual(n, int(sn))
             bits <<= 1
 
+    @support.requires_resource('cpu')
+    def test_pylong_roundtrip_huge(self):
+        # k blocks of 1234567890
+        k = 1_000_000 # so 10 million digits in all
+        tentoten = 10**10
+        n = 1234567890 * ((tentoten**k - 1) // (tentoten - 1))
+        sn = "1234567890" * k
+        self.assertEqual(n, int(sn))
+        self.assertEqual(sn, str(n))
+
+    @support.requires_resource('cpu')
+    @unittest.skipUnless(_pylong, "_pylong module required")
+    def test_whitebox_dec_str_to_int_inner_failsafe(self):
+        # While I believe the number of GUARD digits in this function is
+        # always enough so that no more than one correction step is ever
+        # needed, the code has a "failsafe" path that takes over if I'm
+        # wrong about that. We have no input that reaches that block.
+        # Here we test a contrived input that _does_ reach that block,
+        # provided the number of guard digits is reduced to 1.
+        sn = "6" * (4000000 - 1)
+        n = (10**len(sn) - 1) // 9 * 6
+        orig_spread = _pylong._spread.copy()
+        _pylong._spread.clear()
+        try:
+            self.assertEqual(n, _pylong._dec_str_to_int_inner(sn, GUARD=1))
+            self.assertIn(999, _pylong._spread)
+        finally:
+            _pylong._spread.clear()
+            _pylong._spread.update(orig_spread)
+
+    @unittest.skipUnless(_pylong, "pylong module required")
+    def test_whitebox_dec_str_to_int_inner_monster(self):
+        # I don't think anyone has enough RAM to build a string long enough
+        # for this function to complain. So lie about the string length.
+
+        class LyingStr(str):
+            def __len__(self):
+                return int((1 << 47) / _pylong._LOG_10_BASE_256)
+
+        liar = LyingStr("42")
+        # We have to pass the liar directly to the complaining function. If we
+        # just try `int(liar)`, earlier layers will replace it with plain old
+        # "42".
+        # Embedding `len(liar)` into the f-string failed on the WASI testbot
+        # (don't know what that is):
+        # OverflowError: cannot fit 'int' into an index-sized integer
+        # So a random stab at worming around that.
+        self.assertRaisesRegex(ValueError,
+            f"^cannot convert string of len {liar.__len__()} to int$",
+            _pylong._dec_str_to_int_inner,
+            liar)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst
@@ -0,0 +1 @@
+If the C version of the ``decimal`` module is available, ``int(str)`` now uses it to supply an asymptotically much faster conversion. However, this only applies if the string contains over about 3.5 million digits.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		If the C version of the ``decimal`` module is available, ``int(str)`` now uses it to supply an asymptotically much faster conversion. However, this only applies if the string contains over about 3.5 million digits.