@@ -62,29 +62,104 @@ def f():
6262 yield i
6363 self .assertEqual (list (tuple (f ())), list (range (1000 )))
6464
65- def test_hash (self ):
66- # See SF bug 942952: Weakness in tuple hash
67- # The hash should:
68- # be non-commutative
69- # should spread-out closely spaced values
70- # should not exhibit cancellation in tuples like (x,(x,y))
71- # should be distinct from element hashes: hash(x)!=hash((x,))
72- # This test exercises those cases.
73- # For a pure random hash and N=50, the expected number of occupied
74- # buckets when tossing 252,600 balls into 2**32 buckets
75- # is 252,592.6, or about 7.4 expected collisions. The
76- # standard deviation is 2.73. On a box with 64-bit hash
77- # codes, no collisions are expected. Here we accept no
78- # more than 15 collisions. Any worse and the hash function
79- # is sorely suspect.
80-
65+ # Various tests for hashing of tuples to check that we get few collisions.
66+ #
67+ # Earlier versions of the tuple hash algorithm had collisions
68+ # reported at:
69+ # - https://bugs.python.org/issue942952
70+ # - https://bugs.python.org/issue34751
71+ #
72+ # Notes:
73+ # - The hash of tuples is deterministic: if the test passes once on a given
74+ # system, it will always pass. So the probabilities mentioned in the
75+ # test_hash functions below should be interpreted assuming that the
76+ # hashes are random.
77+ # - Due to the structure in the testsuite inputs, collisions are not
78+ # independent. For example, if hash((a,b)) == hash((c,d)), then also
79+ # hash((a,b,x)) == hash((c,d,x)). But the quoted probabilities assume
80+ # independence anyway.
81+ # - We limit the hash to 32 bits in the tests to have a good test on
82+ # 64-bit systems too. Furthermore, this is also a sanity check that the
83+ # lower 32 bits of a 64-bit hash are sufficiently random too.
84+ def test_hash1 (self ):
85+ # Check for hash collisions between small integers in range(50) and
86+ # certain tuples and nested tuples of such integers.
8187 N = 50
8288 base = list (range (N ))
8389 xp = [(i , j ) for i in base for j in base ]
8490 inps = base + [(i , j ) for i in base for j in xp ] + \
8591 [(i , j ) for i in xp for j in base ] + xp + list (zip (base ))
86- collisions = len (inps ) - len (set (map (hash , inps )))
87- self .assertTrue (collisions <= 15 )
92+ self .assertEqual (len (inps ), 252600 )
93+ hashes = set (hash (x ) % 2 ** 32 for x in inps )
94+ collisions = len (inps ) - len (hashes )
95+
96+ # For a pure random 32-bit hash and N = 252,600 test items, the
97+ # expected number of collisions equals
98+ #
99+ # 2**(-32) * N(N-1)/2 = 7.4
100+ #
101+ # We allow up to 15 collisions, which suffices to make the test
102+ # pass with 99.5% confidence.
103+ self .assertLessEqual (collisions , 15 )
104+
105+ def test_hash2 (self ):
106+ # Check for hash collisions between small integers (positive and
107+ # negative), tuples and nested tuples of such integers.
108+
109+ # All numbers in the interval [-n, ..., n] except -1 because
110+ # hash(-1) == hash(-2).
111+ n = 5
112+ A = [x for x in range (- n , n + 1 ) if x != - 1 ]
113+
114+ B = A + [(a ,) for a in A ]
115+
116+ L2 = [(a ,b ) for a in A for b in A ]
117+ L3 = L2 + [(a ,b ,c ) for a in A for b in A for c in A ]
118+ L4 = L3 + [(a ,b ,c ,d ) for a in A for b in A for c in A for d in A ]
119+
120+ # T = list of testcases. These consist of all (possibly nested
121+ # at most 2 levels deep) tuples containing at most 4 items from
122+ # the set A.
123+ T = A
124+ T += [(a ,) for a in B + L4 ]
125+ T += [(a ,b ) for a in L3 for b in B ]
126+ T += [(a ,b ) for a in L2 for b in L2 ]
127+ T += [(a ,b ) for a in B for b in L3 ]
128+ T += [(a ,b ,c ) for a in B for b in B for c in L2 ]
129+ T += [(a ,b ,c ) for a in B for b in L2 for c in B ]
130+ T += [(a ,b ,c ) for a in L2 for b in B for c in B ]
131+ T += [(a ,b ,c ,d ) for a in B for b in B for c in B for d in B ]
132+ self .assertEqual (len (T ), 345130 )
133+ hashes = set (hash (x ) % 2 ** 32 for x in T )
134+ collisions = len (T ) - len (hashes )
135+
136+ # For a pure random 32-bit hash and N = 345,130 test items, the
137+ # expected number of collisions equals
138+ #
139+ # 2**(-32) * N(N-1)/2 = 13.9
140+ #
141+ # We allow up to 20 collisions, which suffices to make the test
142+ # pass with 95.5% confidence.
143+ self .assertLessEqual (collisions , 20 )
144+
145+ def test_hash3 (self ):
146+ # Check for hash collisions between tuples containing 0.0 and 0.5.
147+ # The hashes of 0.0 and 0.5 itself differ only in one high bit.
148+ # So this implicitly tests propagation of high bits to low bits.
149+ from itertools import product
150+ T = list (product ([0.0 , 0.5 ], repeat = 18 ))
151+ self .assertEqual (len (T ), 262144 )
152+ hashes = set (hash (x ) % 2 ** 32 for x in T )
153+ collisions = len (T ) - len (hashes )
154+
155+ # For a pure random 32-bit hash and N = 262,144 test items, the
156+ # expected number of collisions equals
157+ #
158+ # 2**(-32) * N(N-1)/2 = 8.0
159+ #
160+ # We allow up to 15 collisions, which suffices to make the test
161+ # pass with 99.1% confidence.
162+ self .assertLessEqual (collisions , 15 )
88163
89164 def test_repr (self ):
90165 l0 = tuple ()
0 commit comments