5151 'Cc' : ['C' ], 'Cf' : ['C' ], 'Cs' : ['C' ], 'Co' : ['C' ], 'Cn' : ['C' ],
5252}
5353
54+
# Grapheme cluster data, taken from UAX #29:
# http://www.unicode.org/reports/tr29/

# Code points that are excluded from the Control category.
# NOTE: CR and LF are technically excluded as well, but for convenience
# they are left in the Control group and checked by hand at the place
# where it matters. This remains compliant with the implementation
# requirements.
grapheme_control_exceptions = set((0x200c, 0x200d))

# The Regional_Indicator category, as inclusive (lo, hi) ranges.
grapheme_regional_indicator = [
    (0x1f1e6, 0x1f1ff),
]

# "The following ... are specifically excluded" from the SpacingMark
# category; see http://www.unicode.org/reports/tr29/#SpacingMark
# Each entry is an inclusive (lo, hi) range.
grapheme_spacingmark_exceptions = [
    (0x102b, 0x102c),
    (0x1038, 0x1038),
    (0x1062, 0x1064),
    (0x1067, 0x106d),
    (0x1083, 0x1083),
    (0x1087, 0x108c),
    (0x108f, 0x108f),
    (0x109a, 0x109c),
    (0x19b0, 0x19b4),
    (0x19b8, 0x19b9),
    (0x19bb, 0x19c0),
    (0x19c8, 0x19c9),
    (0x1a61, 0x1a61),
    (0x1a63, 0x1a64),
    (0xaa7b, 0xaa7b),
    (0xaa7d, 0xaa7d),
]

# Code points that are additionally included in the SpacingMark category.
grapheme_spacingmark_extra = set((0xe33, 0xeb3))
77+
5478def fetch (f ):
5579 if not os .path .exists (f ):
5680 os .system ("curl -O http://www.unicode.org/Public/UNIDATA/%s"
@@ -109,7 +133,7 @@ def load_unicode_data(f):
109133 canon_decomp [code ] = seq
110134
111135 # place letter in categories as appropriate
112- for cat in [gencat ] + expanded_categories .get (gencat , []):
136+ for cat in [gencat , "Assigned" ] + expanded_categories .get (gencat , []):
113137 if cat not in gencats :
114138 gencats [cat ] = []
115139 gencats [cat ].append (code )
@@ -120,6 +144,12 @@ def load_unicode_data(f):
120144 combines [combine ] = []
121145 combines [combine ].append (code )
122146
147+ # generate Not_Assigned from Assigned
148+ gencats ["Cn" ] = gen_unassigned (gencats ["Assigned" ])
149+ # Assigned is not a real category
150+ del (gencats ["Assigned" ])
151+ # Other contains Not_Assigned
152+ gencats ["C" ].extend (gencats ["Cn" ])
123153 gencats = group_cats (gencats )
124154 combines = to_combines (group_cats (combines ))
125155
@@ -155,6 +185,11 @@ def ungroup_cat(cat):
155185 lo += 1
156186 return cat_out
157187
def gen_unassigned(assigned):
    """Return the code points that do not appear in `assigned`.

    `assigned` is any iterable of integer code points. The result is an
    ascending list drawn from 0x0000..0xD7FF and 0xE000..0x10FFFF; the
    range 0xD800..0xDFFF (surrogates) is never visited, so it is never
    reported regardless of what `assigned` contains.
    """
    taken = set(assigned)
    missing = []
    # Walk the two scalar-value ranges in order so the output stays sorted.
    for start, stop in ((0, 0xd800), (0xe000, 0x110000)):
        missing.extend(cp for cp in range(start, stop) if cp not in taken)
    return missing
192+
158193def to_combines (combs ):
159194 combs_out = []
160195 for comb in combs :
@@ -350,6 +385,45 @@ def emit_conversions_module(f, lowerupper, upperlower):
350385 sorted (lowerupper .iteritems (), key = operator .itemgetter (0 )), is_pub = False )
351386 f .write ("}\n \n " )
352387
def emit_grapheme_module(f, grapheme_table, grapheme_cats):
    """Write the Rust `grapheme` module to `f`.

    Parameters:
        f              -- writable file-like object receiving the Rust source.
        grapheme_table -- sequence of (lo, hi, category_name) entries; passed
                          straight to `emit_table`, which renders each entry
                          as `(lo, hi, GC_<category_name>)`.
        grapheme_cats  -- iterable of category names. One `GC_<name>` enum
                          variant is emitted per name, followed by the
                          catch-all `GC_Any` variant.

    The category names are materialized and sorted before use so that the
    function accepts any iterable (a plain list, or a dict key view) and so
    that the generated enum is identical from run to run instead of
    following arbitrary dictionary ordering.
    """
    f.write("""pub mod grapheme {
    use core::option::{Some, None};
    use core::slice::ImmutableVector;

    #[allow(non_camel_case_types)]
    #[deriving(Clone)]
    pub enum GraphemeCat {
""")
    # GC_Any is the fallback returned for code points not in the table; it
    # is always emitted last, after the sorted real categories.
    for cat in sorted(grapheme_cats) + ["Any"]:
        f.write("        GC_" + cat + ",\n")
    f.write("""    }

    fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
        use core::cmp::{Equal, Less, Greater};
        match r.bsearch(|&(lo, hi, _)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }) {
            Some(idx) => {
                let (_, _, cat) = r[idx];
                cat
            }
            None => GC_Any
        }
    }

    pub fn grapheme_category(c: char) -> GraphemeCat {
        bsearch_range_value_table(c, grapheme_cat_table)
    }

""")

    # The table itself stays private to the module; callers go through
    # grapheme_category().
    emit_table(f, "grapheme_cat_table", grapheme_table, "&'static [(char, char, GraphemeCat)]",
        pfun=lambda x: "(%s,%s,GC_%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]),
        is_pub=False)
    f.write("}\n")
426+
353427def emit_charwidth_module (f , width_table ):
354428 f .write ("pub mod charwidth {\n " )
355429 f .write (" use core::option::{Option, Some, None};\n " )
@@ -388,7 +462,7 @@ def emit_charwidth_module(f, width_table):
388462 f .write (" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n " )
389463 emit_table (f , "charwidth_table" , width_table , "&'static [(char, char, u8, u8)]" , is_pub = False ,
390464 pfun = lambda x : "(%s,%s,%s,%s)" % (escape_char (x [0 ]), escape_char (x [1 ]), x [2 ], x [3 ]))
391- f .write ("}\n " )
465+ f .write ("}\n \n " )
392466
393467def emit_norm_module (f , canon , compat , combine ):
394468 canon_keys = canon .keys ()
@@ -473,6 +547,8 @@ def remove_from_wtable(wtable, val):
473547 wtable_out .extend (wtable )
474548 return wtable_out
475549
550+
551+
476552def optimize_width_table (wtable ):
477553 wtable_out = []
478554 w_this = wtable .pop (0 )
@@ -487,7 +563,7 @@ def optimize_width_table(wtable):
487563 return wtable_out
488564
489565if __name__ == "__main__" :
490- r = "unicode .rs"
566+ r = "tables .rs"
491567 if os .path .exists (r ):
492568 os .remove (r )
493569 with open (r , "w" ) as rf :
@@ -498,12 +574,18 @@ def optimize_width_table(wtable):
498574 (canon_decomp , compat_decomp , gencats , combines ,
499575 lowerupper , upperlower ) = load_unicode_data ("UnicodeData.txt" )
500576 want_derived = ["XID_Start" , "XID_Continue" , "Alphabetic" , "Lowercase" , "Uppercase" ]
501- other_derived = ["Default_Ignorable_Code_Point" ]
577+ other_derived = ["Default_Ignorable_Code_Point" , "Grapheme_Extend" ]
502578 derived = load_properties ("DerivedCoreProperties.txt" , want_derived + other_derived )
503579 scripts = load_properties ("Scripts.txt" , [])
504580 props = load_properties ("PropList.txt" ,
505581 ["White_Space" , "Join_Control" , "Noncharacter_Code_Point" ])
506582
583+ # grapheme cluster category from DerivedCoreProperties
584+ # the rest are defined below
585+ grapheme_cats = {}
586+ grapheme_cats ["Extend" ] = derived ["Grapheme_Extend" ]
587+ del (derived ["Grapheme_Extend" ])
588+
507589 # bsearch_range_table is used in all the property modules below
508590 emit_bsearch_range_table (rf )
509591
@@ -533,7 +615,7 @@ def optimize_width_table(wtable):
533615 emit_norm_module (rf , canon_decomp , compat_decomp , combines )
534616 emit_conversions_module (rf , lowerupper , upperlower )
535617
536- # character width module
618+ ### character width module
537619 width_table = []
538620 for zwcat in ["Me" , "Mn" , "Cf" ]:
539621 width_table .extend (map (lambda (lo , hi ): (lo , hi , 0 , 0 ), gencats [zwcat ]))
@@ -555,3 +637,40 @@ def optimize_width_table(wtable):
555637 # optimize the width table by collapsing adjacent entities when possible
556638 width_table = optimize_width_table (width_table )
557639 emit_charwidth_module (rf , width_table )
640+
641+ ### grapheme cluster module
642+ # from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
643+ # Hangul syllable categories
644+ want_hangul = ["L" , "V" , "T" , "LV" , "LVT" ]
645+ grapheme_cats .update (load_properties ("HangulSyllableType.txt" , want_hangul ))
646+
647+ # Control
648+ # This category also includes Cs (surrogate codepoints), but Rust's `char`s are
649+ # Unicode Scalar Values only, and surrogates are thus invalid `char`s.
650+ grapheme_cats ["Control" ] = set ()
651+ for cat in ["Zl" , "Zp" , "Cc" , "Cf" ]:
652+ grapheme_cats ["Control" ] |= set (ungroup_cat (gencats [cat ]))
653+ grapheme_cats ["Control" ] = group_cat (list (
654+ grapheme_cats ["Control" ]
655+ - grapheme_control_exceptions
656+ | (set (ungroup_cat (gencats ["Cn" ]))
657+ & set (ungroup_cat (derived ["Default_Ignorable_Code_Point" ])))))
658+
659+ # Regional Indicator
660+ grapheme_cats ["RegionalIndicator" ] = grapheme_regional_indicator
661+
662+ # Prepend - "Currently there are no characters with this value"
663+ # (from UAX#29, Unicode 7.0)
664+
# SpacingMark
# UAX #29: General_Category = Spacing_Mark that is not Extend, minus the
# specifically excluded code points, plus the two extra code points.
# The explicit parentheses are required: `-` binds tighter than `|` in
# Python, so `a - b | c - d` evaluates as `(a - b) | (c - d)`, which would
# subtract the exceptions from the "extra" set only and leave them in the
# Mc-derived set.
grapheme_cats["SpacingMark"] = group_cat(list(
    (set(ungroup_cat(gencats["Mc"]))
     - set(ungroup_cat(grapheme_cats["Extend"]))
     - set(ungroup_cat(grapheme_spacingmark_exceptions)))
    | grapheme_spacingmark_extra))
671+
672+ grapheme_table = []
673+ for cat in grapheme_cats :
674+ grapheme_table .extend ([(x , y , cat ) for (x , y ) in grapheme_cats [cat ]])
675+ grapheme_table .sort (key = lambda w : w [0 ])
676+ emit_grapheme_module (rf , grapheme_table , grapheme_cats .keys ())
0 commit comments