diff --git a/babel/numbers.py b/babel/numbers.py index 6e15fd3a8..12e82d219 100644 --- a/babel/numbers.py +++ b/babel/numbers.py @@ -24,6 +24,7 @@ import warnings from babel.core import default_locale, Locale, get_global +from babel.rbnf import RuleBasedNumberFormat try: # Python 2 @@ -662,6 +663,26 @@ def __init__(self, message, suggestions=None): self.suggestions = suggestions +def spell_number(number, locale=LC_NUMERIC, ruleset=None): + """Return value spelled out for a specific locale + + :param number: the number to format + :param locale: the `Locale` object or locale identifier + :param ruleset: the ruleset to use; defaults to regular numbers. + """ + speller = RuleBasedNumberFormat.negotiate(locale) + return speller.format(number, ruleset=ruleset) + + +def get_rbnf_rules(locale=LC_NUMERIC): + """Return all the available public rules for a specific locale + + :param locale: the `Locale` object or locale identifier + """ + speller = RuleBasedNumberFormat.negotiate(locale) + return speller.available_rulesets + + def parse_number(string, locale=LC_NUMERIC): """Parse localized number string into an integer. diff --git a/babel/rbnf.py b/babel/rbnf.py new file mode 100644 index 000000000..77f771240 --- /dev/null +++ b/babel/rbnf.py @@ -0,0 +1,704 @@ +""" +babel.rbnf +~~~~~~~~~~ + +Locale dependent spelling of numbers. 
+ +Documentation: +- http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting +- http://www.icu-project.org/apiref/icu4c/classRuleBasedNumberFormat.html + +Examples +- http://userguide.icu-project.org/formatparse/numbers/rbnf-examples +- http://source.icu-project.org/repos/icu/trunk/icu4j/demos/src/com/ibm/icu/dev/demo/rbnf/RbnfSampleRuleSets.java + + +""" +# Dev notes +# +# Reloading cldr: +# python ./scripts/import_cldr.py ./cldr/cldr-core-35.1/common/ -f +# +# Tokenization is inspired by Ka-Ping Yee's tokenize library + +# Undocumented syntax (←%rule-name←←) +# Trac ticket filed for CLDR update PL rbnf +# http://unicode.org/cldr/trac/ticket/10544 +# Maybe the syntax need to be supported: +# http://bugs.icu-project.org/trac/ticket/13264 +# Original request for Hebrew (currently not used in Hebrew): +# http://bugs.icu-project.org/trac/ticket/4039 + + +import re +import math +import decimal +import collections +import warnings + +from babel.core import Locale, get_global + +TEXT_TOKEN = 1 +INTEGRAL_TOKEN = 2 +REMAINDER_TOKEN = 3 +PREVIOUS_TOKEN = 4 +SUBSTITUTION_TOKEN = 5 +PLURAL_TOKEN = 6 +OPT_START = 7 +OPT_END = 8 + +token_regexes = [ + (t, re.compile(r)) + for (t, r) in [ + (PLURAL_TOKEN, r"\$\((.+)\)\$"), + (INTEGRAL_TOKEN, r"←([^←[]*)←(←?)"), + (PREVIOUS_TOKEN, r"→→→"), + (REMAINDER_TOKEN, r"→([^→[]*)→"), + (SUBSTITUTION_TOKEN, r"=([^=[]+)="), + (OPT_START, r"\["), + (OPT_END, r"\]"), + (TEXT_TOKEN, r"[^[\]=→←]+"), + ] +] + +INTERNAL_REF = 1 +PRIVATE_REF = 2 +PUBLIC_REF = 3 +PLURAL_REF = 4 +DECIMAL_REF = 5 + +REFERENCE_TOKENS = (INTEGRAL_TOKEN, REMAINDER_TOKEN, SUBSTITUTION_TOKEN) + +NEGATIVE_NUMBER_RULE = '-x' +IMPROPER_FRACTION_RULE = 'x.x' +PROPER_FRACTION_RULE = '0.x' +MASTER_RULE = 'x.0' +INFINITY_RULE = 'Inf' +NOT_A_NUMBER_RULE = 'NaN' +SPECIAL_FRACTION_RULE = 'x,x' # there are other options but not existent in CLDR + + +# locale.number_symbols['decimal'] +# normal rule means a number is specified + + +class 
RBNFError(Exception): + pass + + +class TokenizationError(RBNFError, ValueError): + pass + + +class RulesetNotFound(RBNFError): + pass + + +class RuleNotFound(RBNFError): + pass + + +class RulesetSubstitutionWarning(UserWarning): + pass + + +TokenInfo = collections.namedtuple('TokenInfo', 'type reference optional') + + +def tokenize(text): + """ + Each rule has a list of tokens + + Text is parsed by matching a list of regular expressions + against the beginning of the text. If a regex matches, + a token is generated, and we continue with the rest of + the text. + + Some tokens are optional if they are in square + brackets. From regular expressions for the beginning and + end of the optional section no tokens are generated. + Instead, all the tokens inside the optional section are + flagged as optional. + + Some of the tokens are referencing other rulesets by name. + This information is stored in the token along with the type + of reference. + + """ + # remove unnecessary syntax (only used in the non-xml form) + if text.endswith(";"): + text = text[:-1] + if text.startswith("'"): + text = text[1:] + + optional = False + + while text: + stop = True + # print("TEXT: ", text) + for tok, regex in token_regexes: + # print(token, regex) + match = regex.match(text) + if match: + stop = False + text = text[match.end():] + if tok == OPT_START: + optional = True + elif tok == OPT_END: + optional = False + else: + token = _gen_token(tok, match, optional) + if token: + yield token + break # always start searching with the first regex + if stop: + raise TokenizationError(text) # no token pattern matched the remaining text + + +def _gen_token(tok, match, optional): + # remove this if CLDR is updated based on ticket + # http://unicode.org/cldr/trac/ticket/10544 + if tok == INTEGRAL_TOKEN and match.group(2) == '←': + warnings.warn('Unsupported syntax ←...←←', SyntaxWarning) + + if tok in REFERENCE_TOKENS: + reference = _parse_reference(match.group(1)) + return TokenInfo(tok, reference, optional) + + # currently only `en` has this + if tok ==
PLURAL_TOKEN: + return TokenInfo(tok, (PLURAL_REF, match.group(1)), optional) + + if tok == PREVIOUS_TOKEN: + return TokenInfo(tok, None, optional) + + if tok == TEXT_TOKEN: + return TokenInfo(tok, match.group(0), optional) + + +def _parse_reference(string): + if string == "": + return INTERNAL_REF, "" + if string.startswith('%%'): + return PRIVATE_REF, string[2:] + if string.startswith('%'): + return PUBLIC_REF, string[1:] + if string[0] in '0#': + return DECIMAL_REF, string + warnings.warn(f'Reference parsing error: {string}', SyntaxWarning) + return INTERNAL_REF, "" # defaults to this + + +def compute_divisor(value, radix): + # Compute the highest exponent of radix less than or equal to the rule's base + if isinstance(value, int): + if value == 0: + return 1 + exp = decimal.Decimal(value).ln() / decimal.Decimal(radix).ln() + return int(radix ** math.floor(exp)) + else: + return None + + +class RuleBasedNumberFormat: + """ + RuleBasedNumberFormat's behavior consists of one or more rule sets + + The first ruleset in a locale is the default ruleset. + The substitution descriptor (i.e., the text between the token characters) + may take one of three forms: + :a rule set name: + Perform the mathematical operation on the number, and format the result + using the named rule set. + :a DecimalFormat pattern: + Perform the mathematical operation on the number, and format the + result using a DecimalFormat with the specified pattern. The + pattern must begin with 0 or #. + :nothing: + Perform the mathematical operation on the number, and format the + result using the rule set containing the current rule, except: + + - You can't have an empty substitution descriptor with + a == substitution. + - If you omit the substitution descriptor in a >> substitution + in a fraction rule, format the result one digit at a time + using the rule set containing the current rule. 
+ - If you omit the substitution descriptor in a << substitution + in a rule in a fraction rule set, format the result using + the default rule set for this formatter. + """ + group_types = ('SpelloutRules', 'OrdinalRules', 'NumberingSystemRules') + + # spell number should go for Spelloutrules + # make interface for the other two groups + + def __init__(self, locale, group='SpelloutRules'): + self._locale = locale + self._group = group + self.rulesets = self._locale._data['rbnf_rules'][self._group] + + @property + def available_rulesets(self): + """list available public rulesets""" + return [r.name for r in self.rulesets if not r.private] + + def _find_matching_ruleset(self, prefix): + available_rulesets = self.available_rulesets + if prefix in available_rulesets: + return (prefix, True) + # Sorting here avoids use of more specific ("spellout-ordinal-sinokorean-count") + # rulesets when a shorter one might be available. + for ruleset in sorted(available_rulesets): + if ruleset.startswith(prefix): + return (ruleset, False) + return (None, False) + + def match_ruleset(self, ruleset): + """ + Try to find a matching ruleset given a ruleset name or alias ("year", "ordinal"). + """ + if ruleset == "year": + ruleset = "spellout-numbering-year" + elif ruleset == "ordinal": + ruleset, exact_match = self._find_matching_ruleset("spellout-ordinal") + if not ruleset: + raise RulesetNotFound(f"No ordinal ruleset is available for {self._locale}") + if not exact_match: + warnings.warn(f"Using non-specific ordinal ruleset {ruleset}", RulesetSubstitutionWarning) + ruleset_obj = self.get_ruleset(ruleset) + if not ruleset_obj: + raise RulesetNotFound( + f"Ruleset {ruleset!r} is not one of the ones available for " + f"{self._locale}: {self.available_rulesets!r}" + ) + return ruleset_obj + + def format(self, number, ruleset=None): + """Format a number (int/float/decimal) with spelling rules. 
+ + Ruleset may be an actual ruleset name for the locale, + or one of the aliases "year" or "ordinal". + """ + if not ruleset: + ruleset = "spellout-numbering" + + return self.match_ruleset(ruleset).apply(number, self) + + def get_ruleset(self, name): + for r in self.rulesets: + if r.name == name: + return r + + @classmethod + def negotiate(cls, locale): + """ + Negotiate proper RBNF rules based on global data item `rbnf_locales` + Caching is not necessary the Locale object does that pretty well + """ + loc = Locale.negotiate([str(Locale.parse(locale))], get_global('rbnf_locales')) + if not loc: + raise RulesetNotFound(f"No RBNF rules available for {locale}") + return cls(loc) + + +class Ruleset: + """ + Each rule set consists of a name, a colon, and a list of rules. + (in the ICU syntax, CLDR differs because of XML) + + If the rule's rule descriptor is left out, the base value is one plus the + preceding rule's base value (or zero if this is the first rule in the list) + in a normal rule set. In a fraction rule set, the base value is the same as + the preceding rule's base value. + + A rule set may be either a regular rule set or a fraction rule set, depending + on whether it is used to format a number's integral part (or the whole number) + or a number's fractional part. Using a rule set to format a rule's fractional + part makes it a fraction rule set. + + Which rule is used to format a number is defined according to one of the + following algorithms: + + REGULAR (NON-FRACTION) PROCESSING + --------------------------------- + If the rule set is a regular rule set, do the following: + + MASTER_RULE + If the rule set includes a master rule (and the number was passed in as a + double), use the master rule. (If the number being formatted was passed + in as a long, the master rule is ignored.) + + NEGATIVE_NUMBER_RULE + If the number is negative, use the negative-number rule. 
+ + IMPROPER_FRACTION_RULE + If the number has a fractional part and is greater than 1, use + the improper fraction rule. + + PROPER_FRACTION_RULE + If the number has a fractional part and is between 0 and 1, use + the proper fraction rule. + + Binary-search the rule list for the rule with the highest base value + less than or equal to the number. If that rule has two substitutions, + its base value is not an even multiple of its divisor, and the number + is an even multiple of the rule's divisor, use the rule that precedes + it in the rule list. Otherwise, use the rule itself. + + FRACTION PROCESSING + ------------------- + If the rule set is a fraction rule set, do the following: + + Ignore negative-number and fraction rules. + + For each rule in the list, multiply the number being formatted (which + will always be between 0 and 1) by the rule's base value. Keep track + of the distance between the result and the nearest integer. + + Use the rule that produced the result closest to zero in the above + calculation. In the event of a tie or a direct hit, use the first + matching rule encountered. (The idea here is to try each rule's base + value as a possible denominator of a fraction. Whichever denominator + produces the fraction closest in value to the number being formatted + wins.) + + If the rule following the matching rule has the same base value, + use it if the numerator of the fraction is anything other than 1; if + the numerator is 1, use the original matching rule. (This is to allow + singular and plural forms of the rule text without a lot of extra hassle.) + + ---- + + A rule's body consists of a string of characters terminated by a semicolon. + The rule may include zero, one, or two substitution tokens, and a range of + text in brackets. The brackets denote optional text (and may also include + one or both substitutions). 
The exact meanings of the substitution tokens, + and under what conditions optional text is omitted, depend on the syntax + of the substitution token and the context. The rest of the text in a rule + body is literal text that is output when the rule matches the number + being formatted. + + A substitution token begins and ends with a token character. The token + character and the context together specify a mathematical operation to + be performed on the number being formatted. An optional substitution + descriptor specifies how the value resulting from that operation is + used to fill in the substitution. The position of the substitution + token in the rule body specifies the location of the resultant text + in the original rule text. + + The meanings of the substitution token characters are as follows: + + →→ REMAINDER_TOKEN + :in normal rule: + Divide the number by the rule's divisor and format the remainder + :in negative-number rule: + Find the absolute value of the number and format the result + :in fraction or master rule: + Isolate the number's fractional part and format it. + :in rule in fraction rule set: + Not allowed. + + →→→ PREVIOUS_TOKEN + :in normal rule: + Divide the number by the rule's divisor and format the + remainder, but bypass the normal rule-selection process + and just use the rule that precedes this one in this + rule list. + :in all other rules: + Not allowed. + + ←← INTEGRAL_TOKEN + :in normal rule: + Divide the number by the rule's divisor and format the quotient + :in negative-number rule: + Not allowed. + :in fraction or master rule: + Isolate the number's integral part and format it. + :in rule in fraction rule set: + Multiply the number by the rule's base value and format the result. + + == SUBSTITUTION_TOKEN + :in all rule sets: + Format the number unchanged + + [] OPT_START, OPT_END + :in normal rule: + Omit the optional text if the number is an even + multiple of the rule's divisor + :in negative-number rule: + Not allowed. 
+ :in improper-fraction rule: + Omit the optional text if the number is between 0 and 1 + (same as specifying both an x.x rule and a 0.x rule) + :in master rule: + Omit the optional text if the number is an integer + (same as specifying both an x.x rule and an x.0 rule) + !!! contradicts the above as it says the master rule is ignored + :in proper-fraction rule: + Not allowed. + :in rule in fraction rule set: + Omit the optional text if multiplying the number by the + rule's base value yields 1. + + $(cardinal,plural syntax)$ PLURAL_TOKEN + :in all rule sets: + This provides the ability to choose a word based on the + number divided by the radix to the power of the exponent + of the base value for the specified locale, which is + normally equivalent to the ←← value. This uses the cardinal + plural rules from PluralFormat. All strings used in the + plural format are treated as the same base value for parsing. + + $(ordinal,plural syntax)$ PLURAL_TOKEN + :in all rule sets: + This provides the ability to choose a word based on the + number divided by the radix to the power of the exponent + of the base value for the specified locale, which is + normally equivalent to the ←← value. This uses the ordinal + plural rules from PluralFormat. All strings used in the + plural format are treated as the same base value for parsing. 
+ + INFINITY_RULE = 'Inf' + + NOT_A_NUMBER_RULE = 'NaN' + + SPECIAL_FRACTION_RULE = 'x,x' # there are other options but not existent in CLDR + """ + + def __init__(self, name, private=False): + self.name = name + self.private = private + self.rules = [] + + def apply(self, number, parent, fractional=False): + number = decimal.Decimal(str(number)) + # str is needed to avoid unnecessary precision + # decimal is necessary for exact representation in fraction rules + + context = { + 'search_at': parent, + 'ruleset': self, + 'fractional': fractional, + 'omit_optional': False, # no default value is defined in the spec + SUBSTITUTION_TOKEN: number, + 'remainder_as_fractional': False # format remainder as fractional rule? + } + integral, remainder = divmod(number, 1) + + # fractional rule (ruleset in fractional processing) + # the value should always be between 0 and 1 + # not yet tested it needs clarification + if fractional: + index = self.get_rule_fractional(remainder) + if index is None: + raise RuleNotFound(f"rule for fractional processing of {remainder}") + rule = self.rules[index] + context[INTEGRAL_TOKEN] = rule.value * remainder # here remainder == number + context['omit_optional'] = rule.value * number == 1 + return rule.apply(number, context) + + # negative number rule + if number < 0: + rule = self.get_rule_special(NEGATIVE_NUMBER_RULE) + if rule is None: + raise RuleNotFound(f"negative number rule ({NEGATIVE_NUMBER_RULE})") + context[REMAINDER_TOKEN] = abs(number) + return rule.apply(number, context) + + # master and fraction rules + if remainder != 0: + context[REMAINDER_TOKEN] = number - integral + context[INTEGRAL_TOKEN] = integral + context['remainder_as_fractional'] = True + + # search for master rule + rule = self.get_rule_special(MASTER_RULE, strict=True) + + # no master rule found + if rule is None: + if integral == 0: + rule = self.get_rule_special(PROPER_FRACTION_RULE) + if rule is None: + raise RuleNotFound(f"proper fraction rule 
({PROPER_FRACTION_RULE})") + + else: + rule = self.get_rule_special(IMPROPER_FRACTION_RULE) + if rule is None: + raise RuleNotFound(f"improper fraction rule ({IMPROPER_FRACTION_RULE})") + context['omit_optional'] = 0 < number < 1 # between 0 and 1 + + return rule.apply(number, context) + + # normal rule + index = self.get_rule_integral(integral) + if index is None: + raise RuleNotFound(f"normal rule for {integral}") + rule = self.rules[index] + i, r = divmod(integral, rule.divisor) + context[REMAINDER_TOKEN] = r + context[INTEGRAL_TOKEN] = i + context[PREVIOUS_TOKEN] = index - 1 # get rule using ruleset + context['omit_optional'] = r != 0 # only if not even multiple (TODO no need to store separately) + return rule.apply(number, context) + + def get_rule_special(self, val, strict=False): + if val in Rule.specials: + for r in self.rules: + if r.value == val: + return r + + # return last rule if no match occurred and strict is false + if not strict: + return self.rules[-1] + + def get_rule_integral(self, val): + """ + Binary-search the rule list for the rule with the highest base value + less than or equal to the number. + + If that rule has two substitutions, + its base value is not an even multiple of its divisor, and the number + is an even multiple of the rule's divisor, use the rule that precedes + it in the rule list. Otherwise, use the rule itself. + """ + # automatically return last rule if no range matched + ret = len(self.rules) - 1 + + for i in range(len(self.rules) - 1): + if self.rules[i].value in Rule.specials: + continue + + if self.rules[i].value <= val < self.rules[i + 1].value: + ret = i + break + + # need to have at least one normal rule? 
(otherwise ret could be None) + rule = self.rules[ret] + if rule.substitutions == 2 and \ + rule.value % rule.divisor == 0 and \ + val % rule.divisor == 0: + ret -= 1 + + return ret + + def get_rule_fractional(self, val): + """If the rule set is a fraction rule set, do the following: + + Ignore negative-number and fraction rules. + + For each rule in the list, multiply the number being formatted (which + will always be between 0 and 1) by the rule's base value. Keep track + of the distance between the result and the nearest integer. + + Use the rule that produced the result closest to zero in the above + calculation. In the event of a tie or a direct hit, use the first + matching rule encountered. (The idea here is to try each rule's base + value as a possible denominator of a fraction. Whichever denominator + produces the fraction closest in value to the number being formatted + wins.) + + If the rule following the matching rule has the same base value, + use it if the numerator of the fraction is anything other than 1; if + the numerator is 1, use the original matching rule. (This is to allow + singular and plural forms of the rule text without a lot of extra hassle.) + + ??? what is considered the numerator of what fraction here + ??? 
is it rather not the closest integer + """ + dists = [] + for i, rule in enumerate(self.rules): + if rule.value in Rule.specials or rule.value == 0: # ignore specials and 0 rules + continue + d = abs(round(val * rule.value) - val * rule.value) + dists.append((i, d)) + + # get the index of the closest 0 match + bst = min(dists, key=lambda x: x[1])[0] # NOTE(review): min() raises ValueError when no rule qualified, so None is never returned + + # there is a following rule + if len(self.rules) > bst + 1 and \ + self.rules[bst].value == self.rules[bst + 1].value and \ + val * self.rules[bst].value > 1: + bst += 1 + + return bst + + def __repr__(self): + rules_str = '\n'.join(['\t' + str(r) for r in self.rules]) + return f'Ruleset {self.name} {self.private}\n{rules_str}\n' + + +class Rule: + """ + A rule has a base value, a divisor, rule text, and zero, one, or two substitutions. + """ + specials = { + NEGATIVE_NUMBER_RULE, IMPROPER_FRACTION_RULE, + PROPER_FRACTION_RULE, MASTER_RULE, INFINITY_RULE, + NOT_A_NUMBER_RULE, SPECIAL_FRACTION_RULE, + } + + def __init__(self, value, text, radix=None): + """ + value: a name from `specials` or an integer base value; text: rule body to tokenize; radix: base used for the divisor (default 10) + """ + self.radix = int(radix or 10) + if value in self.specials: + self.value = value + else: + self.value = int(value) + + self.divisor = compute_divisor(self.value, self.radix) + self.text = text + self.tokens = list(tokenize(text)) + self.substitutions = len([t for t in self.tokens if t.type in REFERENCE_TOKENS]) + + def apply(self, number, context): + """Render this rule's tokens for `number` using the values stored in `context` and return the joined text + """ + from .numbers import format_decimal + res = [] + for t in self.tokens: + if t.optional and not context['omit_optional']: + continue + + if t.type == TEXT_TOKEN: + res.append(t.reference) + + elif t.type in REFERENCE_TOKENS: + ref_type, ref = t.reference + ruleset = None + if ref_type == INTERNAL_REF: + ruleset = context['ruleset'] + elif ref_type in (PUBLIC_REF, PRIVATE_REF): # currently no distinction + ruleset = context['search_at'].get_ruleset(ref) + elif ref_type == DECIMAL_REF: + loc = context['search_at']._locale +
res.append(format_decimal(number, format=ref, locale=loc)) + + if ruleset: + if t.type == REMAINDER_TOKEN and context['remainder_as_fractional']: + fractional = True + else: + fractional = context['fractional'] + res.append(ruleset.apply( + context[t.type], # number + context['search_at'], # parent + fractional, + )) + + elif t.type == PREVIOUS_TOKEN: + rule = context['ruleset'].rules[context[PREVIOUS_TOKEN]] + res.append(rule.apply( + context[REMAINDER_TOKEN], # number + context, # ??? + )) + + else: + raise ValueError(f'unknown token {t}', t) + + return ''.join(res) + + def __repr__(self): + tokens_str = '\n'.join(['\t\t' + str(t) for t in self.tokens]) + return f'Rule {self.value} ({self.text}) - {self.radix}\n{tokens_str}\n' diff --git a/scripts/import_cldr.py b/scripts/import_cldr.py index 5fda2deb4..d3e05940b 100755 --- a/scripts/import_cldr.py +++ b/scripts/import_cldr.py @@ -33,7 +33,7 @@ BABEL_PACKAGE_ROOT = os.path.join(CHECKOUT_ROOT, "babel") sys.path.insert(0, CHECKOUT_ROOT) -from babel import dates, numbers +from babel import dates, numbers, rbnf from babel.dates import split_interval_pattern from babel.localedata import Alias from babel.plural import PluralRule @@ -152,6 +152,8 @@ def _compact_dict(dict): def debug_repr(obj): if isinstance(obj, PluralRule): return obj.abstract + if isinstance(obj, (rbnf.Ruleset, rbnf.Rule)): + return vars(obj) return repr(obj) @@ -328,6 +330,14 @@ def parse_global(srcdir, sup): 'official_status': language.attrib.get('officialStatus'), } territory_languages[territory.attrib['type']] = languages + + # To help the negotiation in `babel.numbers.spell_number` + # add all locales with rbnf rules to a list under `rbnf_locales` + filenames = os.listdir(os.path.join(srcdir, 'rbnf')) + filenames.remove('root.xml') + # TODO parse root.xml for global data (how to fall back?) 
+ global_data['rbnf_locales'] = [os.path.splitext(f)[0] for f in filenames] + return global_data @@ -443,6 +453,13 @@ def _process_local_datas(sup, srcdir, destdir, force=False, dump_json=False): unsupported_number_systems_string, )) + # there will be no rbnf rules for all locales + # there could be a separate iteration for rbnf rule files + rbnf_filename = os.path.join(srcdir, 'rbnf', filename) + if os.path.isfile(rbnf_filename): + rbnf_tree = parse(rbnf_filename) + parse_rbnf_rules(data, rbnf_tree) + write_datafile(data_filename, data, dump_json=dump_json) @@ -981,6 +998,38 @@ def parse_measurement_systems(data, tree): _import_type_text(measurement_systems, measurement_system, type=type) +def parse_rbnf_rules(data, tree): + """ + Parse the RBNF rule sets found in `tree` into data['rbnf_rules'], based on: + http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting + """ + rbnf_rules = data.setdefault('rbnf_rules', {}) + + # ElementTree.dump(tree) + + for ruleset_grouping in tree.findall('.//rbnf/rulesetGrouping'): + group_name = ruleset_grouping.attrib['type'] + rbnf_rules[group_name] = [] # TODO check for overwrite + for ruleset in ruleset_grouping.findall('ruleset'): + ruleset_name = ruleset.attrib['type'] + private = ruleset.attrib.get('access') == 'private' + ruleset_obj = rbnf.Ruleset(ruleset_name, private) + for rule in ruleset.findall('rbnfrule'): + radix = rule.attrib.get('radix') + if radix == "1,000": # HACK: work around misspelled radix in mt.xml + radix = "1000" + try: + rule_obj = rbnf.Rule(rule.attrib['value'], rule.text, radix) + ruleset_obj.rules.append(rule_obj) + except rbnf.TokenizationError as e: + log('%s: Unable to parse rule "%s%s: %s "' % ( + data['locale_id'], + rule.attrib['value'], + '' if radix is None else ('/%s' % radix), # radix suffix belongs right after the base value + rule.text, + )) + rbnf_rules[group_name].append(ruleset_obj) + if __name__ == '__main__': main() diff --git a/setup.cfg b/setup.cfg index 12585f0d7..c7f0a0644 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,6 +6,7 @@ norecursedirs = venv*
.* _* scripts {args} doctest_optionflags = ELLIPSIS NORMALIZE_WHITESPACE ALLOW_UNICODE IGNORE_EXCEPTION_DETAIL markers = all_locales: parameterize test with all locales + all_rbnf_locales: parameterize test with all locales with RBNF rules [bdist_wheel] universal = 1 diff --git a/tests/conftest.py b/tests/conftest.py index 5b14b1ca7..86c0adf45 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,3 +16,7 @@ def pytest_generate_tests(metafunc): from babel.localedata import locale_identifiers metafunc.parametrize("locale", list(locale_identifiers())) break + if mark.name == "all_rbnf_locales": + from babel.core import get_global + metafunc.parametrize("locale", list(get_global('rbnf_locales'))) + break diff --git a/tests/test_number_spelling.py b/tests/test_number_spelling.py new file mode 100644 index 000000000..6c98ea6af --- /dev/null +++ b/tests/test_number_spelling.py @@ -0,0 +1,185 @@ +import unittest + +import pytest + +from babel import numbers +from babel import rbnf +from babel.localedata import locale_identifiers + +soft_hyphen = '\xad' + + +class TestRuleEngine(unittest.TestCase): + """ + Test everything related to the rules engine + """ + + def test_basic(self): + x = rbnf.RuleBasedNumberFormat.negotiate('hu_HU') + assert str(x._locale) == 'hu' + assert 'spellout-numbering' in x.available_rulesets + + def test_negotiation(self): + for lid in locale_identifiers(): + try: + loc = rbnf.RuleBasedNumberFormat.negotiate(lid)._locale + except rbnf.RulesetNotFound: + # generate warning if necessary + continue + # test groups + for k in loc._data['rbnf_rules']: + assert k in rbnf.RuleBasedNumberFormat.group_types + + def test_tokenization(self): + + x = list(rbnf.tokenize("text[opt];")) + res = [ + rbnf.TokenInfo(type=1, reference='text', optional=False), + rbnf.TokenInfo(type=1, reference='opt', optional=True), + ] + assert x == res + + def test_xml_parsing(self): + """ + all the rules should be able to go through the parser and tokenizer + made up some 
rules and run the tokenizer on them + + TODO + read data from all the locales that have rbnf_rules defined + all the raw rules should be in a specific structure based + on the XML specification + """ + assert True + + +class TestSpelling(unittest.TestCase): + """ + Locale specific tests + """ + + def test_hu_HU_cardinal(self): + def _spell(x): + return numbers.spell_number(x, locale='hu_HU').replace(soft_hyphen, '') + + assert _spell(0) == "nulla" + assert _spell(1) == "egy" + assert _spell(2) == "kettő" + assert _spell(3) == "három" + assert _spell(10) == "tíz" + assert _spell(20) == "húsz" + # assert _spell('-0') == "mínusz nulla" + # assert _spell(123.25) == "százhuszonhárom egész huszonöt század" + assert _spell(-12) == "mínusz tizenkettő" + # assert _spell(23457829) == "huszonhárommillió-négyszázötvenhétezer-nyolcszázhuszonkilenc" + assert _spell(1950) == "ezerkilencszázötven" + # only soft hyphens in the rules !!! + # assert _spell(2001) == "kétezer-egy" + # assert _spell('1999.2386') == "ezerkilencszázkilencvenkilenc egész kétezer-háromszáznyolcvanhat tízezred" + # assert _spell(-.199923862) == "mínusz nulla egész százkilencvenkilencezer-kilencszázhuszonnégy milliomod" + # assert _spell(-.199923862) == "kerekítve mínusz nulla egész ezerkilencszázkilencvenkilenc tízezred" + # assert _spell(.4326752) == "nulla egész negyvenhárom század" + + def test_hu_HU_ordinal(self): + def _spell(x): + return numbers.spell_number(x, locale='hu_HU', ruleset="ordinal").replace(soft_hyphen, '') + + assert _spell(0) == "nulla" + # assert _spell(0) == "nulladik" + assert _spell(1) == "első" + assert _spell(2) == "második" + assert _spell(3) == "harmadik" + assert _spell(10) == "tizedik" + assert _spell(20) == "huszadik" + assert _spell(30) == "harmincadik" + assert _spell(-12) == "mínusz tizenkettedik" + # assert _spell(23457829) == "huszonhárommilliónégyszázötvenhétezernyolcszázhuszonkilencedik" # wrong mutiple cldr errors + # assert _spell(23457829) == 
"huszonhárommillió-négyszázötvenhétezer-nyolcszázhuszonkilencedik" + assert _spell(1100) == "ezerszázadik" + assert _spell(1950) == "ezerkilencszázötvenedik" + # assert _spell(2001) == "kétezer-egyedik" + + def test_en_GB_cardinal(self): + def _spell(x): + return numbers.spell_number(x, locale='en_GB').replace(soft_hyphen, '') + + assert _spell(0) == "zero" + assert _spell(1) == "one" + assert _spell(2) == "two" + assert _spell(3) == "three" + # assert _spell('-0') == "minus zero" + # assert _spell(123.25) == "one hundred and twenty-three point twenty-five hundredths" + assert _spell(-12) == "minus twelve" + assert _spell(23457829) == "twenty-three million four hundred fifty-seven thousand eight hundred twenty-nine" + # assert _spell(23457829) == "twenty-three million four hundred and fifty-seven thousand eight hundred and twenty-nine" + assert _spell(1950) == "one thousand nine hundred fifty" + # assert _spell(1950) == "one thousand nine hundred and fifty" + assert _spell(2001) == "two thousand one" + # assert _spell('1999.238') == "one thousand nine hundred and ninety-nine point two hundred and thirty-eight thousandths" + # assert _spell(-.199923862, precision=3, state_rounded=True) == "approximately minus zero point two tenths" + # assert _spell(-.1) == "minus zero point one tenth" # float to string conversion preserves precision + + def test_en_GB_ordinal(self): + def _spell(x): + return numbers.spell_number(x, locale='en_GB', ruleset="ordinal").replace(soft_hyphen, '') + + assert _spell(0) == "zeroth" + assert _spell(1) == "first" + assert _spell(2) == "second" + assert _spell(3) == "third" + assert _spell(4) == "fourth" + assert _spell(5) == "fifth" + assert _spell(6) == "sixth" + assert _spell(7) == "seventh" + assert _spell(8) == "eighth" + assert _spell(9) == "ninth" + assert _spell(10) == "tenth" + assert _spell(11) == "eleventh" + assert _spell(12) == "twelfth" + assert _spell(13) == "thirteenth" + assert _spell(20) == "twentieth" + assert _spell(30) == 
"thirtieth" + assert _spell(40) == "fortieth" + # assert _spell(40) == "fourtieth" + assert _spell(-12) == "minus twelfth" + # assert _spell(23457829) == "twenty-three million four hundred fifty-seven thousand eight hundred twenty-ninth" # apostrophes + # assert _spell(23457829) == "twenty-three million four hundred and fifty-seven thousand eight hundred and twenty-ninth" + assert _spell(1950) == "one thousand nine hundred fiftieth" + # assert _spell(1950) == "one thousand nine hundred and fiftieth" + assert _spell(2001) == "two thousand first" + + +@pytest.mark.all_rbnf_locales +@pytest.mark.parametrize('ruleset', (None, 'year', 'ordinal')) +def test_spelling_smoke(locale, ruleset): + try: + assert numbers.spell_number(2020, locale=locale, ruleset=ruleset) + except rbnf.RulesetNotFound: # Not all locales have all rulesets, so skip the smoke test. + pass + except RecursionError: # Some combinations currently fail with this :( + pytest.xfail(f'Locale {locale}, ruleset {ruleset}') + +# def test_hu_HU_error(): +# with pytest.raises(exceptions.TooBigToSpell) as excinfo: +# _spell(10**66, ordinal=True) + +# with pytest.raises(exceptions.PrecisionError) as excinfo: +# _spell(.4326752, locale='hu_HU', precision=7) + +# with pytest.raises(exceptions.PrecisionError) as excinfo: +# _spell(.4326752) + +# with pytest.raises(exceptions.NoFractionOrdinalsAllowed) as excinfo: +# _spell('1999.23862', ordinal=True) + +# def test_en_GB_error(): +# with pytest.raises(exceptions.TooBigToSpell) as excinfo: +# _spell(10**24, ordinal=True, locale='en_GB') + +# with pytest.raises(exceptions.PrecisionError) as excinfo: +# _spell(.4326752, locale='en_GB', precision=4) + +# with pytest.raises(exceptions.PrecisionError) as excinfo: +# _spell(.4326752, locale='en_GB') + +# with pytest.raises(exceptions.NoFractionOrdinalsAllowed) as excinfo: +# _spell('1999.23', ordinal=True, locale='en_GB')