2525
2626import fileinput , re , os , sys , operator
2727
# Running size totals, in bytes, accumulated across every emit_bool_trie call:
# bytes_old adds 8 bytes per (lo, hi) range of the input table, bytes_new adds
# the size of the trie arrays emitted in its place.
bytes_old = 0
bytes_new = 0
30+
2831preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2932// file at the top-level directory of this distribution and at
3033// http://rust-lang.org/COPYRIGHT.
@@ -307,12 +310,137 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
307310 format_table_content (f , data , 8 )
308311 f .write ("\n ];\n \n " )
309312
def emit_trie_lookup_range_table(f):
    """Write the Rust ``BoolTrie`` definition and its lookup functions to ``f``.

    The emitted snippet is constant: the ``BoolTrie`` struct (fields r1..r6),
    the ``trie_range_leaf`` bit test, and ``trie_lookup_range_table``, which
    picks one of three lookup paths based on the codepoint value.

    Args:
        f: object with a ``write`` method that receives the Rust source text.
    """
    rust_source = """

// BoolTrie is a trie for representing a set of Unicode codepoints. It is
// implemented with postfix compression (sharing of identical child nodes),
// which gives both compact size and fast lookup.
//
// The space of Unicode codepoints is divided into 3 subareas, each
// represented by a trie with different depth. In the first (0..0x800), there
// is no trie structure at all; each u64 entry corresponds to a bitvector
// effectively holding 64 bool values.
//
// In the second (0x800..0x10000), each child of the root node represents a
// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
// the trie stores an 8-bit index into a shared table of leaf values. This
// exploits the fact that in reasonable sets, many such leaves can be shared.
//
// In the third (0x10000..0x110000), each child of the root node represents a
// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
// of a child tree. Each of these 64 bytes represents an index into the table
// of shared 64-bit leaf values. This exploits the sparse structure in the
// non-BMP range of most Unicode sets.
pub struct BoolTrie {
// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
r1: [u64; 32], // leaves

// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
r2: [u8; 992], // first level
r3: &'static [u64], // leaves

// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
r4: [u8; 256], // first level
r5: &'static [u8], // second level
r6: &'static [u64], // leaves
}

fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
((bitmap_chunk >> (c & 63)) & 1) != 0
}

fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool {
let c = c as usize;
if c < 0x800 {
trie_range_leaf(c, r.r1[c >> 6])
} else if c < 0x10000 {
let child = r.r2[(c >> 6) - 0x20];
trie_range_leaf(c, r.r3[child as usize])
} else {
let child = r.r4[(c >> 12) - 0x10];
let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
trie_range_leaf(c, r.r6[leaf as usize])
}
}\n
"""
    f.write(rust_source)
367+
def compute_trie(rawdata, chunksize):
    """Build one trie level by sharing identical fixed-size chunks.

    ``rawdata`` is cut into consecutive chunks of ``chunksize`` entries. Each
    distinct chunk is stored once, in first-seen order, and every chunk
    position is replaced by the index of its stored copy.

    Args:
        rawdata: sliceable sequence of entries (their ``str()`` form is used
            to decide whether two chunks are identical).
        chunksize: number of entries per chunk; must be non-zero.

    Returns:
        A ``(root, child_data)`` tuple. ``root[i]`` is the index of the
        distinct chunk found at position ``i``; ``child_data`` is the
        concatenation of the distinct chunks, so chunk ``k`` occupies
        ``child_data[k * chunksize:(k + 1) * chunksize]``.

    Entries past the last complete chunk are ignored.
    """
    root = []
    childmap = {}
    child_data = []
    # Floor division: "/" yields a float on Python 3, which range() rejects.
    nchunks = len(rawdata) // chunksize
    for start in range(0, nchunks * chunksize, chunksize):
        data = rawdata[start:start + chunksize]
        child = '|'.join(map(str, data))
        if child not in childmap:
            # First occurrence: assign the next index and keep the payload.
            childmap[child] = len(childmap)
            child_data.extend(data)
        root.append(childmap[child])
    return (root, child_data)
380+
def _emit_trie_array(f, header, cells):
    """Write one BoolTrie field: ``header``, the formatted ``cells``, then the closing bracket."""
    f.write(header)
    format_table_content(f, ','.join(cells), 12)
    f.write("\n ],\n ")


def emit_bool_trie(f, name, t_data, is_pub=True):
    """Emit a Rust ``BoolTrie`` constant describing the codepoints in ``t_data``.

    The set is first turned into a bitmap of 64-bit chunks covering
    0..0x110000. The chunks for 0..0x800 are written directly as ``r1``;
    the chunks for 0x800..0x10000 go through one ``compute_trie`` level
    (``r2`` indices into ``r3`` leaves); the chunks for 0x10000..0x110000 go
    through two levels (``r4`` -> ``r5`` -> ``r6`` leaves).

    Also updates the module-level ``bytes_old`` / ``bytes_new`` counters.

    Args:
        f: object with a ``write`` method that receives the Rust source.
        name: identifier of the emitted Rust constant.
        t_data: sized collection of ``(lo, hi)`` codepoint ranges, both ends
            inclusive.
        is_pub: when true, the constant is prefixed with ``pub``.
    """
    global bytes_old, bytes_new
    bytes_old += 8 * len(t_data)
    CHUNK = 64

    # Build the bitmap directly: bit (cp % CHUNK) of chunk (cp // CHUNK) is
    # set for every codepoint in the set. Floor division is used throughout
    # because "/" yields a float on Python 3, which range() and slicing reject.
    chunks = [0] * (0x110000 // CHUNK)
    for (lo, hi) in t_data:
        for cp in range(lo, hi + 1):
            chunks[cp // CHUNK] |= 1 << (cp % CHUNK)

    pub_string = "pub " if is_pub else ""
    f.write(" %sconst %s: &'static super::BoolTrie = &super::BoolTrie {\n " % (pub_string, name))

    # 0..0x800: raw leaves, no indirection
    _emit_trie_array(f, " r1: [\n ",
                     ('0x%016x' % chunk for chunk in chunks[0:0x800 // CHUNK]))

    # 0x800..0x10000 trie
    # NOTE(review): r2, r4 and r5 entries land in u8 fields of the Rust struct
    # written by emit_trie_lookup_range_table; nothing here checks that the
    # indices stay below 256.
    (r2, r3) = compute_trie(chunks[0x800 // CHUNK:0x10000 // CHUNK], 64 // CHUNK)
    _emit_trie_array(f, " r2: [\n ", (str(node) for node in r2))
    _emit_trie_array(f, " r3: &[\n ", ('0x%016x' % chunk for chunk in r3))

    # 0x10000..0x110000 trie
    (mid, r6) = compute_trie(chunks[0x10000 // CHUNK:0x110000 // CHUNK], 64 // CHUNK)
    (r4, r5) = compute_trie(mid, 64)
    _emit_trie_array(f, " r4: [\n ", (str(node) for node in r4))
    _emit_trie_array(f, " r5: &[\n ", (str(node) for node in r5))
    _emit_trie_array(f, " r6: &[\n ", ('0x%016x' % chunk for chunk in r6))

    f.write(" };\n \n ")
    # Fixed-size fields (256 + 992 + 256) plus the variable-length r3, r5, r6.
    bytes_new += 256 + 992 + 256 + 8 * len(r3) + len(r5) + 8 * len(r6)
437+
def emit_property_module(f, mod, tbl, emit):
    """Write a Rust ``pub mod`` holding one trie table and predicate per property.

    For each name in ``emit`` (visited in sorted order) this emits the
    ``<name>_table`` constant via ``emit_bool_trie`` followed by a
    ``pub fn <name>(c: char) -> bool`` that looks ``c`` up in that table.

    Args:
        f: object with a ``write`` method that receives the Rust source.
        mod: name of the Rust module to open.
        tbl: mapping from property name to its range data.
        emit: iterable of property names to include.
    """
    f.write("pub mod %s {\n " % mod)
    for prop in sorted(emit):
        ranges = tbl[prop]
        emit_bool_trie(f, "%s_table" % prop, ranges)
        # Predicate wrapper delegating to the shared trie lookup.
        f.write(" pub fn %s(c: char) -> bool {\n " % prop)
        f.write(" super::trie_lookup_range_table (c, %s_table)\n " % prop)
        f.write(" }\n \n ")
    f.write("}\n \n ")
318446
@@ -402,8 +530,9 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
402530 norm_props = load_properties ("DerivedNormalizationProps.txt" ,
403531 ["Full_Composition_Exclusion" ])
404532
405- # bsearch_range_table is used in all the property modules below
406- emit_bsearch_range_table (rf )
533+ # trie_lookup_range_table is used in all the property modules below
534+ emit_trie_lookup_range_table (rf )
535+ # emit_bsearch_range_table(rf)
407536
408537 # category tables
409538 for (name , cat , pfuns ) in ("general_category" , gencats , ["N" , "Cc" ]), \
@@ -414,3 +543,4 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
414543 # normalizations and conversions module
415544 emit_norm_module (rf , canon_decomp , compat_decomp , combines , norm_props )
416545 emit_conversions_module (rf , to_upper , to_lower , to_title )
546+ #print 'bytes before = %d, bytes after = %d' % (bytes_old, bytes_new)
0 commit comments