Skip to content

Commit 075adf8

Browse files
committed
bpo-34751: improved hash function for tuples
1 parent 121eb16 commit 075adf8

File tree

4 files changed

+148
-45
lines changed

4 files changed

+148
-45
lines changed

Include/pyhash.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ PyAPI_FUNC(Py_hash_t) _Py_HashPointer(void*);
1212
PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void*, Py_ssize_t);
1313
#endif
1414

15-
/* Prime multiplier used in string and various other hashes. */
16-
#define _PyHASH_MULTIPLIER 1000003UL /* 0xf4243 */
15+
/* Multiplier used for various hashes */
16+
#define _PyHASH_MULTIPLIER 1000003UL
1717

1818
/* Parameters used for the numeric hash implementation. See notes for
1919
_Py_HashDouble in Python/pyhash.c. Numeric hashes are based on

Lib/test/test_tuple.py

Lines changed: 93 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -62,29 +62,104 @@ def f():
6262
yield i
6363
self.assertEqual(list(tuple(f())), list(range(1000)))
6464

65-
def test_hash(self):
66-
# See SF bug 942952: Weakness in tuple hash
67-
# The hash should:
68-
# be non-commutative
69-
# should spread-out closely spaced values
70-
# should not exhibit cancellation in tuples like (x,(x,y))
71-
# should be distinct from element hashes: hash(x)!=hash((x,))
72-
# This test exercises those cases.
73-
# For a pure random hash and N=50, the expected number of occupied
74-
# buckets when tossing 252,600 balls into 2**32 buckets
75-
# is 252,592.6, or about 7.4 expected collisions. The
76-
# standard deviation is 2.73. On a box with 64-bit hash
77-
# codes, no collisions are expected. Here we accept no
78-
# more than 15 collisions. Any worse and the hash function
79-
# is sorely suspect.
80-
65+
# Various tests for hashing of tuples to check that we get few collisions.
66+
#
67+
# Earlier versions of the tuple hash algorithm had collisions
68+
# reported at:
69+
# - https://bugs.python.org/issue942952
70+
# - https://bugs.python.org/issue34751
71+
#
72+
# Notes:
73+
# - The hash of tuples is deterministic: if the test passes once on a given
74+
# system, it will always pass. So the probabilities mentioned in the
75+
# test_hash functions below should be interpreted assuming that the
76+
# hashes are random.
77+
# - Due to the structure in the testsuite inputs, collisions are not
78+
# independent. For example, if hash((a,b)) == hash((c,d)), then also
79+
# hash((a,b,x)) == hash((c,d,x)). But the quoted probabilities assume
80+
# independence anyway.
81+
# - We limit the hash to 32 bits in the tests to have a good test on
82+
# 64-bit systems too. Furthermore, this is also a sanity check that the
83+
# lower 32 bits of a 64-bit hash are sufficiently random too.
84+
def test_hash1(self):
85+
# Check for hash collisions between small integers in range(50) and
86+
# certain tuples and nested tuples of such integers.
8187
N=50
8288
base = list(range(N))
8389
xp = [(i, j) for i in base for j in base]
8490
inps = base + [(i, j) for i in base for j in xp] + \
8591
[(i, j) for i in xp for j in base] + xp + list(zip(base))
86-
collisions = len(inps) - len(set(map(hash, inps)))
87-
self.assertTrue(collisions <= 15)
92+
self.assertEqual(len(inps), 252600)
93+
hashes = set(hash(x) % 2**32 for x in inps)
94+
collisions = len(inps) - len(hashes)
95+
96+
# For a pure random 32-bit hash and N = 252,600 test items, the
97+
# expected number of collisions equals
98+
#
99+
# 2**(-32) * N(N-1)/2 = 7.4
100+
#
101+
# We allow up to 15 collisions, which suffices to make the test
102+
# pass with 99.5% confidence.
103+
self.assertLessEqual(collisions, 15)
104+
105+
def test_hash2(self):
106+
# Check for hash collisions between small integers (positive and
107+
# negative), tuples and nested tuples of such integers.
108+
109+
# All numbers in the interval [-n, ..., n] except -1 because
110+
# hash(-1) == hash(-2).
111+
n = 5
112+
A = [x for x in range(-n, n+1) if x != -1]
113+
114+
B = A + [(a,) for a in A]
115+
116+
L2 = [(a,b) for a in A for b in A]
117+
L3 = L2 + [(a,b,c) for a in A for b in A for c in A]
118+
L4 = L3 + [(a,b,c,d) for a in A for b in A for c in A for d in A]
119+
120+
# T = list of testcases. These consist of all (possibly nested
121+
# at most 2 levels deep) tuples containing at most 4 items from
122+
# the set A.
123+
T = A
124+
T += [(a,) for a in B + L4]
125+
T += [(a,b) for a in L3 for b in B]
126+
T += [(a,b) for a in L2 for b in L2]
127+
T += [(a,b) for a in B for b in L3]
128+
T += [(a,b,c) for a in B for b in B for c in L2]
129+
T += [(a,b,c) for a in B for b in L2 for c in B]
130+
T += [(a,b,c) for a in L2 for b in B for c in B]
131+
T += [(a,b,c,d) for a in B for b in B for c in B for d in B]
132+
self.assertEqual(len(T), 345130)
133+
hashes = set(hash(x) % 2**32 for x in T)
134+
collisions = len(T) - len(hashes)
135+
136+
# For a pure random 32-bit hash and N = 345,130 test items, the
137+
# expected number of collisions equals
138+
#
139+
# 2**(-32) * N(N-1)/2 = 13.9
140+
#
141+
# We allow up to 20 collisions, which suffices to make the test
142+
# pass with 95.5% confidence.
143+
self.assertLessEqual(collisions, 20)
144+
145+
def test_hash3(self):
146+
# Check for hash collisions between tuples containing 0.0 and 0.5.
147+
# The hashes of 0.0 and 0.5 themselves differ only in one high bit.
148+
# So this implicitly tests propagation of high bits to low bits.
149+
from itertools import product
150+
T = list(product([0.0, 0.5], repeat=18))
151+
self.assertEqual(len(T), 262144)
152+
hashes = set(hash(x) % 2**32 for x in T)
153+
collisions = len(T) - len(hashes)
154+
155+
# For a pure random 32-bit hash and N = 262,144 test items, the
156+
# expected number of collisions equals
157+
#
158+
# 2**(-32) * N(N-1)/2 = 8.0
159+
#
160+
# We allow up to 15 collisions, which suffices to make the test
161+
# pass with 99.1% confidence.
162+
self.assertLessEqual(collisions, 15)
88163

89164
def test_repr(self):
90165
l0 = tuple()
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
The hash function for tuples is now based on xxHash.
2+
This makes a high rate of collisions much less likely.
3+
Additionally, on 64-bit systems it improves tuple hashes in general.
4+
Patch by Jeroen Demeyer.

Objects/tupleobject.c

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -333,39 +333,63 @@ tuplerepr(PyTupleObject *v)
333333
return NULL;
334334
}
335335

336-
/* The addend 82520, was selected from the range(0, 1000000) for
337-
generating the greatest number of prime multipliers for tuples
338-
up to length eight:
339336

340-
1082527, 1165049, 1082531, 1165057, 1247581, 1330103, 1082533,
341-
1330111, 1412633, 1165069, 1247599, 1495177, 1577699
342-
343-
Tests have shown that it's not worth to cache the hash value, see
344-
issue #9685.
337+
/* Hash for tuples. This is a slightly simplified version of the xxHash
338+
non-cryptographic hash:
339+
- we do not use any parallelism, there is only 1 accumulator.
340+
- we drop the final mixing since this is just a permutation of the
341+
output space: it does not help against collisions.
342+
- at the end, we mangle the length with a single constant.
343+
For the xxHash specification, see
344+
https://github.com/Cyan4973/xxHash/blob/master/doc/xxhash_spec.md
345+
346+
Below are the official constants from the xxHash specification. Optimizing
347+
compilers should emit a single "rotate" instruction for the
348+
_PyHASH_XXROTATE() expansion. If that doesn't happen for some important
349+
platform, the macro could be changed to expand to a platform-specific rotate
350+
spelling instead.
351+
352+
NOTE: if xxHash is ever used for other hashes besides tuples, these macros
353+
should be moved to Include/pyhash.h
345354
*/
355+
#if SIZEOF_PY_UHASH_T > 4
356+
#define _PyHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
357+
#define _PyHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
358+
#define _PyHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
359+
#define _PyHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
360+
#else
361+
#define _PyHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
362+
#define _PyHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
363+
#define _PyHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
364+
#define _PyHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
365+
#endif
346366

367+
/* Tests have shown that it's not worth caching the hash value, see
368+
https://bugs.python.org/issue9685 */
347369
static Py_hash_t
348370
tuplehash(PyTupleObject *v)
349371
{
350-
Py_uhash_t x; /* Unsigned for defined overflow behavior. */
351-
Py_hash_t y;
352-
Py_ssize_t len = Py_SIZE(v);
353-
PyObject **p;
354-
Py_uhash_t mult = _PyHASH_MULTIPLIER;
355-
x = 0x345678UL;
356-
p = v->ob_item;
357-
while (--len >= 0) {
358-
y = PyObject_Hash(*p++);
359-
if (y == -1)
372+
Py_ssize_t i, len = Py_SIZE(v);
373+
PyObject **item = v->ob_item;
374+
375+
Py_uhash_t acc = _PyHASH_XXPRIME_5;
376+
for (i = 0; i < len; i++) {
377+
Py_uhash_t lane = PyObject_Hash(item[i]);
378+
if (lane == (Py_uhash_t)-1) {
360379
return -1;
361-
x = (x ^ y) * mult;
362-
/* the cast might truncate len; that doesn't change hash stability */
363-
mult += (Py_hash_t)(82520UL + len + len);
380+
}
381+
acc += lane * _PyHASH_XXPRIME_2;
382+
acc = _PyHASH_XXROTATE(acc);
383+
acc *= _PyHASH_XXPRIME_1;
384+
}
385+
386+
/* Add input length, mangled to keep the historical value of hash(()). */
387+
acc += len ^ (_PyHASH_XXPRIME_5 ^ 3527539UL);
388+
389+
if (acc == (Py_uhash_t)-1) {
390+
return 1546275796;
364391
}
365-
x += 97531UL;
366-
if (x == (Py_uhash_t)-1)
367-
x = -2;
368-
return x;
392+
return acc;
369393
}
370394

371395
static Py_ssize_t

0 commit comments

Comments
 (0)