Skip to content

Commit 075adf8

Browse files
committed
bpo-34751: improved hash function for tuples
1 parent 121eb16 commit 075adf8

File tree

4 files changed

+148
-45
lines changed

4 files changed

+148
-45
lines changed

Include/pyhash.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ PyAPI_FUNC(Py_hash_t) _Py_HashPointer(void*);
1212
PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void*, Py_ssize_t);
1313
#endif
1414

15-
/* Prime multiplier used in string and various other hashes. */
16-
#define _PyHASH_MULTIPLIER 1000003UL /* 0xf4243 */
15+
/* Multiplier used for various hashes */
16+
#define _PyHASH_MULTIPLIER 1000003UL
1717

1818
/* Parameters used for the numeric hash implementation. See notes for
1919
_Py_HashDouble in Python/pyhash.c. Numeric hashes are based on

Lib/test/test_tuple.py

Lines changed: 93 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -62,29 +62,104 @@ def f():
6262
yield i
6363
self.assertEqual(list(tuple(f())), list(range(1000)))
6464

65-
def test_hash(self):
66-
# See SF bug 942952: Weakness in tuple hash
67-
# The hash should:
68-
# be non-commutative
69-
# should spread-out closely spaced values
70-
# should not exhibit cancellation in tuples like (x,(x,y))
71-
# should be distinct from element hashes: hash(x)!=hash((x,))
72-
# This test exercises those cases.
73-
# For a pure random hash and N=50, the expected number of occupied
74-
# buckets when tossing 252,600 balls into 2**32 buckets
75-
# is 252,592.6, or about 7.4 expected collisions. The
76-
# standard deviation is 2.73. On a box with 64-bit hash
77-
# codes, no collisions are expected. Here we accept no
78-
# more than 15 collisions. Any worse and the hash function
79-
# is sorely suspect.
80-
65+
# Various tests for hashing of tuples to check that we get few collisions.
66+
#
67+
# Earlier versions of the tuple hash algorithm had collisions
68+
# reported at:
69+
# - https://bugs.python.org/issue942952
70+
# - https://bugs.python.org/issue34751
71+
#
72+
# Notes:
73+
# - The hash of tuples is deterministic: if the test passes once on a given
74+
# system, it will always pass. So the probabilities mentioned in the
75+
# test_hash functions below should be interpreted assuming that the
76+
# hashes are random.
77+
# - Due to the structure in the testsuite inputs, collisions are not
78+
# independent. For example, if hash((a,b)) == hash((c,d)), then also
79+
# hash((a,b,x)) == hash((c,d,x)). But the quoted probabilities assume
80+
# independence anyway.
81+
# - We limit the hash to 32 bits in the tests to have a good test on
82+
# 64-bit systems too. Furthermore, this is also a sanity check that the
83+
# lower 32 bits of a 64-bit hash are sufficiently random too.
84+
def test_hash1(self):
85+
# Check for hash collisions between small integers in range(50) and
86+
# certain tuples and nested tuples of such integers.
8187
N=50
8288
base = list(range(N))
8389
xp = [(i, j) for i in base for j in base]
8490
inps = base + [(i, j) for i in base for j in xp] + \
8591
[(i, j) for i in xp for j in base] + xp + list(zip(base))
86-
collisions = len(inps) - len(set(map(hash, inps)))
87-
self.assertTrue(collisions <= 15)
92+
self.assertEqual(len(inps), 252600)
93+
hashes = set(hash(x) % 2**32 for x in inps)
94+
collisions = len(inps) - len(hashes)
95+
96+
# For a pure random 32-bit hash and N = 252,600 test items, the
97+
# expected number of collisions equals
98+
#
99+
# 2**(-32) * N(N-1)/2 = 7.4
100+
#
101+
# We allow up to 15 collisions, which suffices to make the test
102+
# pass with 99.5% confidence.
103+
self.assertLessEqual(collisions, 15)
104+
105+
def test_hash2(self):
106+
# Check for hash collisions between small integers (positive and
107+
# negative), tuples and nested tuples of such integers.
108+
109+
# All numbers in the interval [-n, ..., n] except -1 because
110+
# hash(-1) == hash(-2).
111+
n = 5
112+
A = [x for x in range(-n, n+1) if x != -1]
113+
114+
B = A + [(a,) for a in A]
115+
116+
L2 = [(a,b) for a in A for b in A]
117+
L3 = L2 + [(a,b,c) for a in A for b in A for c in A]
118+
L4 = L3 + [(a,b,c,d) for a in A for b in A for c in A for d in A]
119+
120+
# T = list of testcases. These consist of all (possibly nested
121+
# at most 2 levels deep) tuples containing at most 4 items from
122+
# the set A.
123+
T = A
124+
T += [(a,) for a in B + L4]
125+
T += [(a,b) for a in L3 for b in B]
126+
T += [(a,b) for a in L2 for b in L2]
127+
T += [(a,b) for a in B for b in L3]
128+
T += [(a,b,c) for a in B for b in B for c in L2]
129+
T += [(a,b,c) for a in B for b in L2 for c in B]
130+
T += [(a,b,c) for a in L2 for b in B for c in B]
131+
T += [(a,b,c,d) for a in B for b in B for c in B for d in B]
132+
self.assertEqual(len(T), 345130)
133+
hashes = set(hash(x) % 2**32 for x in T)
134+
collisions = len(T) - len(hashes)
135+
136+
# For a pure random 32-bit hash and N = 345,130 test items, the
137+
# expected number of collisions equals
138+
#
139+
# 2**(-32) * N(N-1)/2 = 13.9
140+
#
141+
# We allow up to 20 collisions, which suffices to make the test
142+
# pass with 95.5% confidence.
143+
self.assertLessEqual(collisions, 20)
144+
145+
def test_hash3(self):
146+
# Check for hash collisions between tuples containing 0.0 and 0.5.
147+
# The hashes of 0.0 and 0.5 themselves differ only in one high bit.
148+
# So this implicitly tests propagation of high bits to low bits.
149+
from itertools import product
150+
T = list(product([0.0, 0.5], repeat=18))
151+
self.assertEqual(len(T), 262144)
152+
hashes = set(hash(x) % 2**32 for x in T)
153+
collisions = len(T) - len(hashes)
154+
155+
# For a pure random 32-bit hash and N = 262,144 test items, the
156+
# expected number of collisions equals
157+
#
158+
# 2**(-32) * N(N-1)/2 = 8.0
159+
#
160+
# We allow up to 15 collisions, which suffices to make the test
161+
# pass with 99.1% confidence.
162+
self.assertLessEqual(collisions, 15)
88163

89164
def test_repr(self):
90165
l0 = tuple()
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
The hash function for tuples is now based on xxHash.
2+
This makes a high rate of collisions much less likely.
3+
Additionally, on 64-bit systems it improves tuple hashes in general.
4+
Patch by Jeroen Demeyer.

Objects/tupleobject.c

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -333,39 +333,63 @@ tuplerepr(PyTupleObject *v)
333333
return NULL;
334334
}
335335

336-
/* The addend 82520, was selected from the range(0, 1000000) for
337-
generating the greatest number of prime multipliers for tuples
338-
up to length eight:
339336

340-
1082527, 1165049, 1082531, 1165057, 1247581, 1330103, 1082533,
341-
1330111, 1412633, 1165069, 1247599, 1495177, 1577699
342-
343-
Tests have shown that it's not worth to cache the hash value, see
344-
issue #9685.
337+
/* Hash for tuples. This is a slightly simplified version of the xxHash
338+
non-cryptographic hash:
339+
- we do not use any parallelism, there is only 1 accumulator.
340+
- we drop the final mixing since this is just a permutation of the
341+
output space: it does not help against collisions.
342+
- at the end, we mangle the length with a single constant.
343+
For the xxHash specification, see
344+
https://github.com/Cyan4973/xxHash/blob/master/doc/xxhash_spec.md
345+
346+
Below are the official constants from the xxHash specification. Optimizing
347+
compilers should emit a single "rotate" instruction for the
348+
_PyHASH_XXROTATE() expansion. If that doesn't happen for some important
349+
platform, the macro could be changed to expand to a platform-specific rotate
350+
spelling instead.
351+
352+
NOTE: if xxHash is ever used for other hashes besides tuples, these macros
353+
should be moved to Include/pyhash.h
345354
*/
355+
#if SIZEOF_PY_UHASH_T > 4
356+
#define _PyHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
357+
#define _PyHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
358+
#define _PyHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
359+
#define _PyHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
360+
#else
361+
#define _PyHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
362+
#define _PyHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
363+
#define _PyHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
364+
#define _PyHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
365+
#endif
346366

367+
/* Tests have shown that it's not worth caching the hash value, see
368+
https://bugs.python.org/issue9685 */
347369
static Py_hash_t
348370
tuplehash(PyTupleObject *v)
349371
{
350-
Py_uhash_t x; /* Unsigned for defined overflow behavior. */
351-
Py_hash_t y;
352-
Py_ssize_t len = Py_SIZE(v);
353-
PyObject **p;
354-
Py_uhash_t mult = _PyHASH_MULTIPLIER;
355-
x = 0x345678UL;
356-
p = v->ob_item;
357-
while (--len >= 0) {
358-
y = PyObject_Hash(*p++);
359-
if (y == -1)
372+
Py_ssize_t i, len = Py_SIZE(v);
373+
PyObject **item = v->ob_item;
374+
375+
Py_uhash_t acc = _PyHASH_XXPRIME_5;
376+
for (i = 0; i < len; i++) {
377+
Py_uhash_t lane = PyObject_Hash(item[i]);
378+
if (lane == (Py_uhash_t)-1) {
360379
return -1;
361-
x = (x ^ y) * mult;
362-
/* the cast might truncate len; that doesn't change hash stability */
363-
mult += (Py_hash_t)(82520UL + len + len);
380+
}
381+
acc += lane * _PyHASH_XXPRIME_2;
382+
acc = _PyHASH_XXROTATE(acc);
383+
acc *= _PyHASH_XXPRIME_1;
384+
}
385+
386+
/* Add input length, mangled to keep the historical value of hash(()). */
387+
acc += len ^ (_PyHASH_XXPRIME_5 ^ 3527539UL);
388+
389+
if (acc == (Py_uhash_t)-1) {
390+
return 1546275796;
364391
}
365-
x += 97531UL;
366-
if (x == (Py_uhash_t)-1)
367-
x = -2;
368-
return x;
392+
return acc;
369393
}
370394

371395
static Py_ssize_t

0 commit comments

Comments
 (0)