From 775851e80edab0195157fa03cfb5535ee3dde8e6 Mon Sep 17 00:00:00 2001 From: Szabolcs Date: Sun, 21 Jul 2019 20:41:03 +0200 Subject: [PATCH 1/9] Number spelling based on the CLDR's RBNF rules A pure Python engine for parsing RBNF rules. The rules are incomplete in many cases, fractional number spelling is hardly supported. Based on an earlier discussion: https://github.com/python-babel/babel/pull/114 and referenced in https://github.com/python-babel/babel/issues/179 --- babel/numbers.py | 21 + babel/rbnf.py | 713 ++++++++++++++++++++++++++++++++++ scripts/import_cldr.py | 48 ++- tests/test_number_spelling.py | 183 +++++++++ 4 files changed, 964 insertions(+), 1 deletion(-) create mode 100644 babel/rbnf.py create mode 100644 tests/test_number_spelling.py diff --git a/babel/numbers.py b/babel/numbers.py index 6e15fd3a8..d9a0c0462 100644 --- a/babel/numbers.py +++ b/babel/numbers.py @@ -24,6 +24,7 @@ import warnings from babel.core import default_locale, Locale, get_global +from babel.rbnf import RuleBasedNumberFormat try: # Python 2 @@ -662,6 +663,26 @@ def __init__(self, message, suggestions=None): self.suggestions = suggestions +def spell_number(number, locale=LC_NUMERIC, **kwargs): + """Return value spelled out for a specific locale + + :param number: the number to format + :param locale: the `Locale` object or locale identifier + :param kwargs: optional locale specific parameters + """ + speller = RuleBasedNumberFormat.negotiate(locale) + return speller.format(number, **kwargs) + + +def get_rbnf_rules(locale=LC_NUMERIC): + """Return all the available public rules for a specific locale + + :param locale: the `Locale` object or locale identifier + """ + speller = RuleBasedNumberFormat.negotiate(locale) + return speller.available_rulesets + + def parse_number(string, locale=LC_NUMERIC): """Parse localized number string into an integer. diff --git a/babel/rbnf.py b/babel/rbnf.py new file mode 100644 index 000000000..2a5d24275 --- /dev/null +++ b/babel/rbnf.py @@ -0,0 +1,713 @@ +# -*- coding: utf-8 -*- +""" +babel.rbnf +~~~~~~~~~~ + +Locale dependent spelling of numbers. + +Documentation: +- http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting +- http://www.icu-project.org/apiref/icu4c/classRuleBasedNumberFormat.html + +Examples +- http://userguide.icu-project.org/formatparse/numbers/rbnf-examples +- http://source.icu-project.org/repos/icu/trunk/icu4j/demos/src/com/ibm/icu/dev/demo/rbnf/RbnfSampleRuleSets.java + + +""" +# Dev notes +# +# Reloading cldr: +# python ./scripts/import_cldr.py ./cldr/cldr-core-35.1/common/ -f +# +# Tokenization is inspired by Ka-Ping Yee's tokenize library + +# Undocumented syntax (←%rule-name←←) +# Trac ticket filed for CLDR update PL rbnf +# http://unicode.org/cldr/trac/ticket/10544 +# Maybe the syntax need to be supported: +# http://bugs.icu-project.org/trac/ticket/13264 +# Original request for Hebrew (currently not used in Hebrew): +# http://bugs.icu-project.org/trac/ticket/4039 + +from __future__ import unicode_literals + +import re +import sys +import math +import decimal +import collections +import warnings + +from babel.core import Locale, get_global + +TEXT_TOKEN = 1 +INTEGRAL_TOKEN = 2 +REMAINDER_TOKEN = 3 +PREVIOUS_TOKEN = 4 +SUBSTITUTION_TOKEN = 5 +PLURAL_TOKEN = 6 +OPT_START = 7 +OPT_END = 8 + +regex = [ + (PLURAL_TOKEN, r"\$\((.+)\)\$"), + (INTEGRAL_TOKEN, r"←([^←[]*)←(←?)"), + (PREVIOUS_TOKEN, r"→→→"), + (REMAINDER_TOKEN, r"→([^→[]*)→"), + (SUBSTITUTION_TOKEN, r"=([^=[]+)="), + (OPT_START, r"\["), + (OPT_END, r"\]"), + (TEXT_TOKEN, r"[^[\]=→←]+"), +] + +INTERNAL_REF = 1 +PRIVATE_REF = 2 +PUBLIC_REF = 3 +PLURAL_REF = 4 +DECIMAL_REF = 5 + +REFERENCE_TOKENS = (INTEGRAL_TOKEN, REMAINDER_TOKEN, SUBSTITUTION_TOKEN) + +NEGATIVE_NUMBER_RULE = '-x' +IMPROPER_FRACTION_RULE = 'x.x' +PROPER_FRACTION_RULE = '0.x' +MASTER_RULE = 'x.0' +INFINITY_RULE = 'Inf' +NOT_A_NUMBER_RULE = 'NaN' +SPECIAL_FRACTION_RULE = 'x,x' # there are other options but not existent in CLDR +# locale.number_symbols['decimal'] +# normal rule means a number is specified + + +class RBNFError(Exception): pass +class TokenizationError(RBNFError): pass +class RulesetNotFound(RBNFError): pass +class RuleNotFound(RBNFError): pass + +TokenInfo = collections.namedtuple('TokenInfo', 'type reference optional') + +# compile regex +regex_comp = [(t, re.compile(r)) for t, r in regex] + + +def tokenize(text): + """ + Each rule has a list of tokens + + Text parsed by matching a list of regular expressions + against the beginning of the text. If the regex match + a token is generated and we continue with the rest of + the text. + + Some of the tokens are optional if they are in squared + brackets. From regular expressions for the begining and + end of the optional section no tokens are generated. + Instead all the tokens inside the optional section are + flaged as optional. + + Some of them tokens are referencing other rulesets by name + this information is stored in the token along with the type + of reference. + + """ + # remove uneccesarry syntax (only used in the non-xml form) + if text.endswith(";"): text = text[:-1] + if text.startswith("'"): text = text[1:] + + optional = False + + while text: + stop = True + # print("TEXT: ", text) + for tok, regex in regex_comp: + # print(token, regex) + match = regex.match(text) + if match: + stop = False + text = text[match.end():] + if tok == OPT_START: optional = True + elif tok == OPT_END: optional = False + else: + token = _gen_token(tok, match, optional) + if token: + yield token + break # always start searching with the first regex + if stop: + raise ValueError(text) + + +def _gen_token(tok, match, optional): + # remove this if CLCR is updated based on ticket + # http://unicode.org/cldr/trac/ticket/10544 + if tok == INTEGRAL_TOKEN and match.group(2) == '←': + warnings.warn('Unsupported syntax ←...←←', SyntaxWarning) + + if tok in REFERENCE_TOKENS: + reference = _parse_reference(match.group(1)) + return TokenInfo(tok, reference, optional) + + # currently only `en` has this + if tok == PLURAL_TOKEN: + return TokenInfo(tok, (PLURAL_REF, match.group(1)), optional) + + if tok == PREVIOUS_TOKEN: + return TokenInfo(tok, None, optional) + + if tok == TEXT_TOKEN: + return TokenInfo(tok, match.group(0), optional) + + +def _parse_reference(string): + if string == "": + return INTERNAL_REF, "" + if string.startswith('%%'): + return PRIVATE_REF, string[2:] + if string.startswith('%'): + return PUBLIC_REF, string[1:] + if string[0] in '0#': + return DECIMAL_REF, string + warnings.warn('Reference parsing error: %s' % string, SyntaxWarning) + return INTERNAL_REF, "" # defaults to this + + +def untokenize_ICU(): + """ + TODO implement ICU style representation + rather make Ruleset.format_icu() + """ + + +class RuleBasedNumberFormat(object): + """ + RuleBasedNumberFormat's behavior consists of one or more rule sets + + The first ruleset in a locale is the default ruleset. + The substitution descriptor (i.e., the text between the token characters) + may take one of three forms: + :a rule set name: + Perform the mathematical operation on the number, and format the result + using the named rule set. + :a DecimalFormat pattern: + Perform the mathematical operation on the number, and format the + result using a DecimalFormat with the specified pattern. The + pattern must begin with 0 or #. + :nothing: + Perform the mathematical operation on the number, and format the + result using the rule set containing the current rule, except: + + - You can't have an empty substitution descriptor with + a == substitution. + - If you omit the substitution descriptor in a >> substitution + in a fraction rule, format the result one digit at a time + using the rule set containing the current rule. + - If you omit the substitution descriptor in a << substitution + in a rule in a fraction rule set, format the result using + the default rule set for this formatter. + """ + group_types = ('SpelloutRules', 'OrdinalRules', 'NumberingSystemRules') + # spell number should go for Spelloutrules + # make interface for the other two groups + + def __init__(self, locale, group='SpelloutRules'): + self._locale = locale + self._group = group + + @property + def rulesets(self): + return self._locale._data['rbnf_rules'][self._group] + + @property + def available_rulesets(self): + """list available public rulesets""" + return [r.name for r in self.rulesets if not r.private] + + + def format(self, number, ordinal=False, year=False, ruleset=None, **kwargs): + """spell an actual number (int/float/decimal) + + Search available_rulesets for an entry point + default is `spellout-numbering`. + + If year is True: use spellout-numbering-year + If ordinal is True: use spellout-ordinal + If year and ordinal both True: raise error + + TODO + If no `spellout-ordinal`: + if has `spellout-ordinal-*`: use first one, issue warning + + """ + if ordinal and year: + raise ValueError('both ordinal and year is not possible') + if ordinal: + search = ruleset or 'spellout-ordinal' + elif year: + search = ruleset or 'spellout-year' + else: + search = ruleset or 'spellout-numbering' + + ruleset = self.get_ruleset(search) + + if ruleset is None: + raise RulesetNotFound(search) + + return ruleset.apply(number, self) + + + def get_ruleset(self, name): + for r in self.rulesets: + if r.name == name: + return r + + + @classmethod + def negotiate(cls, locale): + """ + Negotiate proper RBNF rules based on global data item `rbnf_locales` + Caching is not necessary the Locale object does that pretty well + """ + loc = Locale.negotiate([str(Locale.parse(locale))], get_global('rbnf_locales')) + return cls(loc) + + +class Ruleset(object): + """ + Each rule set consists of a name, a colon, and a list of rules. + (in the ICU syntax, CLDR differs because of XML) + + If the rule's rule descriptor is left out, the base value is one plus the + preceding rule's base value (or zero if this is the first rule in the list) + in a normal rule set. In a fraction rule set, the base value is the same as + the preceding rule's base value. + + A rule set may be either a regular rule set or a fraction rule set, depending + on whether it is used to format a number's integral part (or the whole number) + or a number's fractional part. Using a rule set to format a rule's fractional + part makes it a fraction rule set. + + Which rule is used to format a number is defined according to one of the + following algorithms: + + REGULAR (NON-FRACTION) PROCESSING + --------------------------------- + If the rule set is a regular rule set, do the following: + + MASTER_RULE + If the rule set includes a master rule (and the number was passed in as a + double), use the master rule. (If the number being formatted was passed + in as a long, the master rule is ignored.) + + NEGATIVE_NUMBER_RULE + If the number is negative, use the negative-number rule. + + IMPROPER_FRACTION_RULE + If the number has a fractional part and is greater than 1, use + the improper fraction rule. + + PROPER_FRACTION_RULE + If the number has a fractional part and is between 0 and 1, use + the proper fraction rule. + + Binary-search the rule list for the rule with the highest base value + less than or equal to the number. If that rule has two substitutions, + its base value is not an even multiple of its divisor, and the number + is an even multiple of the rule's divisor, use the rule that precedes + it in the rule list. Otherwise, use the rule itself. + + FRACTION PROCESSING + ------------------- + If the rule set is a fraction rule set, do the following: + + Ignore negative-number and fraction rules. + + For each rule in the list, multiply the number being formatted (which + will always be between 0 and 1) by the rule's base value. Keep track + of the distance between the result and the nearest integer. + + Use the rule that produced the result closest to zero in the above + calculation. In the event of a tie or a direct hit, use the first + matching rule encountered. (The idea here is to try each rule's base + value as a possible denominator of a fraction. Whichever denominator + produces the fraction closest in value to the number being formatted + wins.) + + If the rule following the matching rule has the same base value, + use it if the numerator of the fraction is anything other than 1; if + the numerator is 1, use the original matching rule. (This is to allow + singular and plural forms of the rule text without a lot of extra hassle.) + + ---- + + A rule's body consists of a string of characters terminated by a semicolon. + The rule may include zero, one, or two substitution tokens, and a range of + text in brackets. The brackets denote optional text (and may also include + one or both substitutions). The exact meanings of the substitution tokens, + and under what conditions optional text is omitted, depend on the syntax + of the substitution token and the context. The rest of the text in a rule + body is literal text that is output when the rule matches the number + being formatted. + + A substitution token begins and ends with a token character. The token + character and the context together specify a mathematical operation to + be performed on the number being formatted. An optional substitution + descriptor specifies how the value resulting from that operation is + used to fill in the substitution. The position of the substitution + token in the rule body specifies the location of the resultant text + in the original rule text. + + The meanings of the substitution token characters are as follows: + + →→ REMAINDER_TOKEN + :in normal rule: + Divide the number by the rule's divisor and format the remainder + :in negative-number rule: + Find the absolute value of the number and format the result + :in fraction or master rule: + Isolate the number's fractional part and format it. + :in rule in fraction rule set: + Not allowed. + + →→→ PREVIOUS_TOKEN + :in normal rule: + Divide the number by the rule's divisor and format the + remainder, but bypass the normal rule-selection process + and just use the rule that precedes this one in this + rule list. + :in all other rules: + Not allowed. + + ←← INTEGRAL_TOKEN + :in normal rule: + Divide the number by the rule's divisor and format the quotient + :in negative-number rule: + Not allowed. + :in fraction or master rule: + Isolate the number's integral part and format it. + :in rule in fraction rule set: + Multiply the number by the rule's base value and format the result. + + == SUBSTITUTION_TOKEN + :in all rule sets: + Format the number unchanged + + [] OPT_START, OPT_END + :in normal rule: + Omit the optional text if the number is an even + multiple of the rule's divisor + :in negative-number rule: + Not allowed. + :in improper-fraction rule: + Omit the optional text if the number is between 0 and 1 + (same as specifying both an x.x rule and a 0.x rule) + :in master rule: + Omit the optional text if the number is an integer + (same as specifying both an x.x rule and an x.0 rule) + !!! contradicts the above as it says the master rule is ignored + :in proper-fraction rule: + Not allowed. + :in rule in fraction rule set: + Omit the optional text if multiplying the number by the + rule's base value yields 1. + + $(cardinal,plural syntax)$ PLURAL_TOKEN + :in all rule sets: + This provides the ability to choose a word based on the + number divided by the radix to the power of the exponent + of the base value for the specified locale, which is + normally equivalent to the ←← value. This uses the cardinal + plural rules from PluralFormat. All strings used in the + plural format are treated as the same base value for parsing. + + $(ordinal,plural syntax)$ PLURAL_TOKEN + :in all rule sets: + This provides the ability to choose a word based on the + number divided by the radix to the power of the exponent + of the base value for the specified locale, which is + normally equivalent to the ←← value. This uses the ordinal + plural rules from PluralFormat. All strings used in the + plural format are treated as the same base value for parsing. + + INFINITY_RULE = 'Inf' + + NOT_A_NUMBER_RULE = 'NaN' + + SPECIAL_FRACTION_RULE = 'x,x' # there are other options but not existent in CLDR + """ + def __init__(self, name, private=False): + self.name = name + self.private = private + self.rules = [] + + + def apply(self, number, parent, fractional=False): + number = decimal.Decimal(str(number)) + # str is needed to avoid unecessary precision + # decimal is necessary for exact representation in fraction rules + + context = { + 'search_at': parent, + 'ruleset': self, + 'fractional': fractional, + 'omit_optional': False, # no default value is defined in the spec + SUBSTITUTION_TOKEN: number, + 'remainder_as_fractional': False # format remainder as fractional rule? + } + integral, remainder = divmod(number, 1) + + # fractional rule (ruleset in fractional processing) + # the value should always be between 0 and 1 + # not yet tested it needs clarification + if fractional: + index = self.get_rule_fractional(remainder) + if index is None: + raise RuleNotFound("rule for fractional processing of %s" % remainder) + rule = self.rules[index] + context[INTEGRAL_TOKEN] = rule.value * remainder # here remainder == number + context['omit_optional'] = rule.value * number == 1 + return rule.apply(number, context) + + # negative number rule + if number < 0: + rule = self.get_rule_special(NEGATIVE_NUMBER_RULE) + if rule is None: + raise RuleNotFound("negative number rule (%s)" % NEGATIVE_NUMBER_RULE) + context[REMAINDER_TOKEN] = abs(number) + return rule.apply(number, context) + + # master and fraction rules + if remainder != 0: + context[REMAINDER_TOKEN] = number - integral + context[INTEGRAL_TOKEN] = integral + context['remainder_as_fractional'] = True + + # search for master rule + rule = self.get_rule_special(MASTER_RULE, strict=True) + + # no master rule found + if rule is None: + if integral == 0: + rule = self.get_rule_special(PROPER_FRACTION_RULE) + if rule is None: + raise RuleNotFound("proper fraction rule (%s)" % PROPER_FRACTION_RULE) + + else: + rule = self.get_rule_special(IMPROPER_FRACTION_RULE) + if rule is None: + raise RuleNotFound("improper fraction rule (%s)" % IMPROPER_FRACTION_RULE) + context['omit_optional'] = 0 < number < 1 # between 0 and 1 + + return rule.apply(number, context) + + # normal rule + index = self.get_rule_integral(integral) + if index is None: + raise RuleNotFound("normal rule for %s" % integral) + rule = self.rules[index] + i, r = divmod(integral, rule.divisor) + context[REMAINDER_TOKEN] = r + context[INTEGRAL_TOKEN] = i + context[PREVIOUS_TOKEN] = index-1 # get rule using ruleset + context['omit_optional'] = r != 0 # only if not even multiple (TODO no need to store separatelly) + return rule.apply(number, context) + + + def get_rule_special(self, val, strict=False): + if val in Rule.specials: + for r in self.rules: + if r.value == val: + return r + + # return last rule if no match occured and strict is false + if not strict: + return self.rules[-1] + + + def get_rule_integral(self, val): + """ + Binary-search the rule list for the rule with the highest base value + less than or equal to the number. + + If that rule has two substitutions, + its base value is not an even multiple of its divisor, and the number + is an even multiple of the rule's divisor, use the rule that precedes + it in the rule list. Otherwise, use the rule itself. + """ + # automatically return last rule if no range matched + ret = len(self.rules)-1 + + for i in range(len(self.rules)-1): + if self.rules[i].value in Rule.specials: + continue + + if self.rules[i].value <= val < self.rules[i+1].value: + ret = i + break + + # need to have at least one normal rule? (otherwise ret could be None) + rule = self.rules[ret] + if rule.substitutions == 2 and \ + rule.value % rule.divisor == 0 and \ + val % rule.divisor == 0: + ret -= 1 + + return ret + + + def get_rule_fractional(self, val): + """If the rule set is a fraction rule set, do the following: + + Ignore negative-number and fraction rules. + + For each rule in the list, multiply the number being formatted (which + will always be between 0 and 1) by the rule's base value. Keep track + of the distance between the result and the nearest integer. + + Use the rule that produced the result closest to zero in the above + calculation. In the event of a tie or a direct hit, use the first + matching rule encountered. (The idea here is to try each rule's base + value as a possible denominator of a fraction. Whichever denominator + produces the fraction closest in value to the number being formatted + wins.) + + If the rule following the matching rule has the same base value, + use it if the numerator of the fraction is anything other than 1; if + the numerator is 1, use the original matching rule. (This is to allow + singular and plural forms of the rule text without a lot of extra hassle.) + + ??? what is considered the numerator of what fraction here + ??? is it rather not the closeset integer + """ + dists = [] + for i, rule in enumerate(self.rules): + if rule.value in Rule.specials or rule.value == 0: # ignore specials and 0 rules + continue + d = abs(round(val*rule.value) - val*rule.value) + dists.append((i, d)) + + # get the index of the closest 0 match + bst = min(dists, key=lambda x: x[1])[0] + + # there is a following rule + if len(self.rules) > bst+1 and \ + self.rules[bst].value == self.rules[bst+1].value and \ + val*self.rules[bst].value > 1: + bst += 1 + + return bst + + + def __repr__(self): + return 'Ruleset %s %s\n%s\n' % (self.name, self.private, '\n'.join(['\t'+str(r) for r in self.rules])) + + +class Rule(object): + """ + base value, a divisor, rule text, and zero, one, or two substitutions. + """ + specials = ( + NEGATIVE_NUMBER_RULE, IMPROPER_FRACTION_RULE, + PROPER_FRACTION_RULE, MASTER_RULE, INFINITY_RULE, + NOT_A_NUMBER_RULE, SPECIAL_FRACTION_RULE, + ) + + + def __init__(self, value, text, radix=None): + """ + divisor : iterator of literal, back_sub, fwd_sub, lit_exact elements parsed from rule + """ + if value in self.specials: + self.value = value + else: + try: + self.value = int(value) + except: + warnings.warn("Unknown rule value: [%s]" % value, SyntaxWarning) + + self.text = text + self._radix = radix + + self._parse(text) + + + def apply(self, number, context): + """ + """ + from .numbers import format_decimal + res = [] + for t in self.tokens: + if t.optional and not context['omit_optional']: + continue + + if t.type == TEXT_TOKEN: + res.append(t.reference) + + elif t.type in REFERENCE_TOKENS: + ref_type, ref = t.reference + ruleset = None + if ref_type == INTERNAL_REF: + ruleset = context['ruleset'] + elif ref_type in (PUBLIC_REF, PRIVATE_REF): # currently no distinction + ruleset = context['search_at'].get_ruleset(ref) + elif ref_type == DECIMAL_REF: + loc = context['search_at']._locale + x = numbers.format_decimal(number, format=ref, locale=loc) + res.append(x) + + if ruleset: + if t.type == REMAINDER_TOKEN and context['remainder_as_fractional']: + fractional = True + else: + fractional = context['fractional'] + res.append(ruleset.apply( + context[t.type], # number + context['search_at'], # parent + fractional, + )) + + elif t.type == PREVIOUS_TOKEN: + rule = context['ruleset'].rules[context[PREVIOUS_TOKEN]] + res.append(rule.apply( + context[REMAINDER_TOKEN], # number + context, # ??? + )) + + else: + raise ValueError('unknown token %s', t) + + + return ''.join(res) + + + @property + def divisor(self): + """it is highest exponent of radix less then or equal to the rules's base""" + if isinstance(self.value, int): + if self.value == 0: + return 1 + exp = decimal.Decimal(self.value).ln()/decimal.Decimal(self.radix).ln() + return int(self.radix**math.floor(exp)) + + + @property + def radix(self): + return self._radix or 10 + + + @property + def substitutions(self): + return len([t for t in self.tokens if t.type in REFERENCE_TOKENS]) + + + def _parse(self, text): + try: + self.tokens = [t for t in tokenize(text)] + except ValueError: + raise TokenizationError(self.text) + + + def __repr__(self): + return 'Rule %s (%s) - %s\n%s\n' % ( + self.value, self.text, + self.radix, + '\n'.join(['\t\t'+str(t) for t in self.tokens])) diff --git a/scripts/import_cldr.py b/scripts/import_cldr.py index 5fda2deb4..531d8528b 100755 --- a/scripts/import_cldr.py +++ b/scripts/import_cldr.py @@ -33,7 +33,7 @@ BABEL_PACKAGE_ROOT = os.path.join(CHECKOUT_ROOT, "babel") sys.path.insert(0, CHECKOUT_ROOT) -from babel import dates, numbers +from babel import dates, numbers, rbnf from babel.dates import split_interval_pattern from babel.localedata import Alias from babel.plural import PluralRule @@ -225,6 +225,7 @@ def parse_global(srcdir, sup): all_currencies = collections.defaultdict(set) currency_fractions = global_data.setdefault('currency_fractions', {}) territory_languages = global_data.setdefault('territory_languages', {}) + rbnf_locales = global_data.setdefault('rbnf_locales', []) bcp47_timezone = parse(os.path.join(srcdir, 'bcp47', 'timezone.xml')) sup_windows_zones = parse(os.path.join(sup_dir, 'windowsZones.xml')) sup_metadata = parse(os.path.join(sup_dir, 'supplementalMetadata.xml')) @@ -328,6 +329,14 @@ def parse_global(srcdir, sup): 'official_status': language.attrib.get('officialStatus'), } territory_languages[territory.attrib['type']] = languages + + # To help the negotiation in `babel.numbers.spell_number` + # add all locales with rbnf rules to a list under `rbnf_locales` + filenames = os.listdir(os.path.join(srcdir, 'rbnf')) + filenames.remove('root.xml') + # TODO parse root.xml for global data (how to fall back?) + global_data['rbnf_locales'] = [os.path.splitext(f)[0] for f in filenames] + return global_data @@ -443,6 +452,13 @@ def _process_local_datas(sup, srcdir, destdir, force=False, dump_json=False): unsupported_number_systems_string, )) + # there will be no rbnf rules for all locales + # there could be a separate iteration for rbnf rule files + rbnf_filename = os.path.join(srcdir, 'rbnf', filename) + if os.path.isfile(rbnf_filename): + rbnf_tree = parse(rbnf_filename) + parse_rbnf_rules(data, rbnf_tree) + write_datafile(data_filename, data, dump_json=dump_json) @@ -981,6 +997,36 @@ def parse_measurement_systems(data, tree): _import_type_text(measurement_systems, measurement_system, type=type) +def parse_rbnf_rules(data, tree): + """ + Parse rules based on: + http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting + """ + rbnf_rules = data.setdefault('rbnf_rules', {}) + + # ElementTree.dump(tree) + + for ruleset_grouping in tree.findall('.//rbnf/rulesetGrouping'): + group_name = ruleset_grouping.attrib['type'] + rbnf_rules[group_name] = [] # TODO check for overwrite + for ruleset in ruleset_grouping.findall('ruleset'): + ruleset_name = ruleset.attrib['type'] + private = ruleset.attrib.get('access') == 'private' + ruleset_obj = rbnf.Ruleset(ruleset_name, private) + for rule in ruleset.findall('rbnfrule'): + radix = rule.attrib.get('radix') + try: + rule_obj = rbnf.Rule(rule.attrib['value'], rule.text, radix) + except rbnf.TokenizationError as e: + log('%s: Unable to parse rule "%s%s: %s "' % ( + data['locale_id'], + rule.attrib['value'], + rule.text, + '' if radix is None else ('/%s' % radix), + )) + ruleset_obj.rules.append(rule_obj) + rbnf_rules[group_name].append(ruleset_obj) + if __name__ == '__main__': main() diff --git a/tests/test_number_spelling.py b/tests/test_number_spelling.py new file mode 100644 index 000000000..bcfcf8644 --- /dev/null +++ b/tests/test_number_spelling.py @@ -0,0 +1,183 @@ +import unittest +import pytest + +from babel import numbers +from babel import rbnf +from babel.core import get_global +from babel.localedata import locale_identifiers + +soft_hyphen = '\xad' + +class TestRuleEngine(unittest.TestCase): + """ + Test everything related to the rules engine + """ + def test_basic(self): + x = rbnf.RuleBasedNumberFormat.negotiate('hu_HU') + assert str(x._locale) == 'hu' + assert 'spellout-numbering' in x.available_rulesets + + + def test_negotiation(self): + valid_ruleset_groups = ("SpelloutRules", "OrdinalRules", "NumberingSystemRules") + + for lid in locale_identifiers(): + loc = rbnf.RuleBasedNumberFormat.negotiate(lid)._locale + if loc is None: + # generate warning if necessary + pass + else: + # test groups + for k in loc._data['rbnf_rules']: + assert k in valid_ruleset_groups + + + def test_tokenization(self): + + x = list(rbnf.tokenize("text[opt];")) + res = [ + rbnf.TokenInfo(type=1, reference='text', optional=False), + rbnf.TokenInfo(type=1, reference='opt', optional=True), + ] + assert x == res + + + def test_xml_parsing(self): + """ + all the rules should be able to go through the parser and tokenizer + made up some rules and run the tokenizer on them + + TODO + read data from all the locales that have rbnf_rules defined + all the raw rules should be in a specific structure based + on the XML specification + """ + assert True + + +class TestSpelling(unittest.TestCase): + """ + Locale specific tests + """ + def test_hu_HU_cardinal(self): + def _spell(x): + return numbers.spell_number(x, locale='hu_HU').replace(soft_hyphen, '') + + assert _spell(0) == "nulla" + assert _spell(1) == "egy" + assert _spell(2) == "kettő" + assert _spell(3) == "három" + assert _spell(10) == "tíz" + assert _spell(20) == "húsz" + # assert _spell('-0') == "mínusz nulla" + # assert _spell(123.25) == "százhuszonhárom egész huszonöt század" + assert _spell(-12) == "mínusz tizenkettő" + # assert _spell(23457829) == "huszonhárommillió-négyszázötvenhétezer-nyolcszázhuszonkilenc" + assert _spell(1950) == "ezerkilencszázötven" + # only soft hyphens in the rules !!! + # assert _spell(2001) == "kétezer-egy" + # assert _spell('1999.2386') == "ezerkilencszázkilencvenkilenc egész kétezer-háromszáznyolcvanhat tízezred" + # assert _spell(-.199923862) == "mínusz nulla egész százkilencvenkilencezer-kilencszázhuszonnégy milliomod" + # assert _spell(-.199923862) == "kerekítve mínusz nulla egész ezerkilencszázkilencvenkilenc tízezred" + # assert _spell(.4326752) == "nulla egész negyvenhárom század" + + + def test_hu_HU_ordinal(self): + def _spell(x): + return numbers.spell_number(x, locale='hu_HU', ordinal=True).replace(soft_hyphen, '') + + assert _spell(0) == "nulla" + # assert _spell(0) == "nulladik" + assert _spell(1) == "első" + assert _spell(2) == "második" + assert _spell(3) == "harmadik" + assert _spell(10) == "tizedik" + assert _spell(20) == "huszadik" + assert _spell(30) == "harmincadik" + assert _spell(-12) == "mínusz tizenkettedik" + # assert _spell(23457829) == "huszonhárommilliónégyszázötvenhétezernyolcszázhuszonkilencedik" # wrong mutiple cldr errors + # assert _spell(23457829) == "huszonhárommillió-négyszázötvenhétezer-nyolcszázhuszonkilencedik" + assert _spell(1100) == "ezerszázadik" + assert _spell(1950) == "ezerkilencszázötvenedik" + # assert _spell(2001) == "kétezer-egyedik" + + + def test_en_GB_cardinal(self): + def _spell(x): + return numbers.spell_number(x, locale='en_GB').replace(soft_hyphen, '') + + assert _spell(0) == "zero" + assert _spell(1) == "one" + assert _spell(2) == "two" + assert _spell(3) == "three" + # assert _spell('-0') == "minus zero" + # assert _spell(123.25) == "one hundred and twenty-three point twenty-five hundredths" + assert _spell(-12) == "minus twelve" + assert _spell(23457829) == "twenty-three million four hundred fifty-seven thousand eight hundred twenty-nine" + # assert _spell(23457829) == "twenty-three million four hundred and fifty-seven thousand eight hundred and twenty-nine" + assert _spell(1950) == "one thousand nine hundred fifty" + # assert _spell(1950) == "one thousand nine hundred and fifty" + assert _spell(2001) == "two thousand one" + # assert _spell('1999.238') == "one thousand nine hundred and ninety-nine point two hundred and thirty-eight thousandths" + # assert _spell(-.199923862, precision=3, state_rounded=True) == "approximately minus zero point two tenths" + # assert _spell(-.1) == "minus zero point one tenth" # float to string conversion preserves precision + + + def test_en_GB_ordinal(self): + def _spell(x): + return numbers.spell_number(x, locale='en_GB', ordinal=True).replace(soft_hyphen, '') + + assert _spell(0) == "zeroth" + assert _spell(1) == "first" + assert _spell(2) == "second" + assert _spell(3) == "third" + assert _spell(4) == "fourth" + assert _spell(5) == "fifth" + assert _spell(6) == "sixth" + assert _spell(7) == "seventh" + assert _spell(8) == "eighth" + assert _spell(9) == "ninth" + assert _spell(10) == "tenth" + assert _spell(11) == "eleventh" + assert _spell(12) == "twelfth" + assert _spell(13) == "thirteenth" + assert _spell(20) == "twentieth" + assert _spell(30) == "thirtieth" + assert _spell(40) == "fortieth" + # assert _spell(40) == "fourtieth" + assert _spell(-12) == "minus twelfth" + # assert _spell(23457829) == "twenty-three million four hundred fifty-seven thousand eight hundred twenty-ninth" # apostrophes + # assert _spell(23457829) == "twenty-three million four hundred and fifty-seven thousand eight hundred and twenty-ninth" + assert _spell(1950) == "one thousand nine hundred fiftieth" + # assert _spell(1950) == "one thousand nine hundred and fiftieth" + assert _spell(2001) == "two thousand first" + + + +# def test_hu_HU_error(): +# with pytest.raises(exceptions.TooBigToSpell) as excinfo: +# _spell(10**66, ordinal=True) + +# with pytest.raises(exceptions.PrecisionError) as excinfo: +# _spell(.4326752, locale='hu_HU', precision=7) + +# with pytest.raises(exceptions.PrecisionError) as excinfo: +# _spell(.4326752) + +# with pytest.raises(exceptions.NoFractionOrdinalsAllowed) as excinfo: +# _spell('1999.23862', ordinal=True) + +# def test_en_GB_error(): +# with pytest.raises(exceptions.TooBigToSpell) as excinfo: +# _spell(10**24, ordinal=True, locale='en_GB') + +# with pytest.raises(exceptions.PrecisionError) as excinfo: +# _spell(.4326752, locale='en_GB', precision=4) + +# with pytest.raises(exceptions.PrecisionError) as excinfo: +# _spell(.4326752, locale='en_GB') + +# with pytest.raises(exceptions.NoFractionOrdinalsAllowed) as excinfo: +# _spell('1999.23', ordinal=True, locale='en_GB') + + From 47051732a957532ccd51730452ebbf2958ce30fc Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 31 Dec 2019 12:22:05 +0200 Subject: [PATCH 2/9] rbnf: correct radix reading --- babel/rbnf.py | 20 +++++--------------- scripts/import_cldr.py | 7 ++++--- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/babel/rbnf.py b/babel/rbnf.py index 2a5d24275..4b28fd037 100644 --- a/babel/rbnf.py +++ b/babel/rbnf.py @@ -619,16 +619,12 @@ def __init__(self, value, text, radix=None): if value in self.specials: self.value = value else: - try: - self.value = int(value) - except: - warnings.warn("Unknown rule value: [%s]" % value, SyntaxWarning) + self.value = int(value) self.text = text - self._radix = radix - - self._parse(text) + self.radix = int(radix or 10) + self._parse(text) def apply(self, number, context): """ @@ -685,14 +681,8 @@ def divisor(self): if isinstance(self.value, int): if self.value == 0: return 1 - exp = decimal.Decimal(self.value).ln()/decimal.Decimal(self.radix).ln() - return int(self.radix**math.floor(exp)) - - - @property - def radix(self): - return self._radix or 10 - + exp = decimal.Decimal(self.value).ln() / decimal.Decimal(self.radix).ln() + return int(self.radix ** math.floor(exp)) @property def substitutions(self): diff --git a/scripts/import_cldr.py b/scripts/import_cldr.py index 531d8528b..bd6967b2f 100755 --- a/scripts/import_cldr.py +++ b/scripts/import_cldr.py @@ -225,7 +225,6 @@ def parse_global(srcdir, sup): all_currencies = collections.defaultdict(set) currency_fractions = global_data.setdefault('currency_fractions', {}) territory_languages = global_data.setdefault('territory_languages', {}) - rbnf_locales = global_data.setdefault('rbnf_locales', []) bcp47_timezone = parse(os.path.join(srcdir, 'bcp47', 'timezone.xml')) sup_windows_zones = parse(os.path.join(sup_dir, 'windowsZones.xml')) sup_metadata = parse(os.path.join(sup_dir, 'supplementalMetadata.xml')) @@ -1015,8 +1014,11 @@ def parse_rbnf_rules(data, tree): ruleset_obj = rbnf.Ruleset(ruleset_name, private) for rule in ruleset.findall('rbnfrule'): radix = rule.attrib.get('radix') + if radix == "1,000": # HACK: work around misspelled radix in mt.xml + radix = "1000" try: rule_obj = rbnf.Rule(rule.attrib['value'], rule.text, radix) + ruleset_obj.rules.append(rule_obj) except rbnf.TokenizationError as e: log('%s: Unable to parse rule "%s%s: %s "' % ( data['locale_id'], @@ -1024,8 +1026,7 @@ def parse_rbnf_rules(data, tree): rule.text, '' if radix is None else ('/%s' % radix), )) - ruleset_obj.rules.append(rule_obj) - rbnf_rules[group_name].append(ruleset_obj) + rbnf_rules[group_name].append(ruleset_obj) if __name__ == '__main__': From 1e3c5251499781ff5848f35627b2b3b40df6b65a Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 31 Dec 2019 12:51:03 +0200 Subject: [PATCH 3/9] rbnf: light clean up --- babel/rbnf.py | 125 ++++++++++++++++------------------ tests/test_number_spelling.py | 17 ++--- 2 files changed, 62 insertions(+), 80 deletions(-) diff --git a/babel/rbnf.py b/babel/rbnf.py index 4b28fd037..60540ccb2 100644 --- a/babel/rbnf.py +++ b/babel/rbnf.py @@ -33,7 +33,6 @@ from __future__ import unicode_literals import re -import sys import math import decimal import collections @@ -50,15 +49,18 @@ OPT_START = 7 OPT_END = 8 -regex = [ - (PLURAL_TOKEN, r"\$\((.+)\)\$"), - (INTEGRAL_TOKEN, r"←([^←[]*)←(←?)"), - (PREVIOUS_TOKEN, r"→→→"), - (REMAINDER_TOKEN, r"→([^→[]*)→"), - (SUBSTITUTION_TOKEN, r"=([^=[]+)="), - (OPT_START, r"\["), - (OPT_END, r"\]"), - (TEXT_TOKEN, r"[^[\]=→←]+"), +token_regexes = [ + (t, re.compile(r)) + for (t, r) in [ + (PLURAL_TOKEN, r"\$\((.+)\)\$"), + (INTEGRAL_TOKEN, r"←([^←[]*)←(←?)"), + (PREVIOUS_TOKEN, r"→→→"), + (REMAINDER_TOKEN, r"→([^→[]*)→"), + (SUBSTITUTION_TOKEN, r"=([^=[]+)="), + (OPT_START, r"\["), + (OPT_END, r"\]"), + (TEXT_TOKEN, r"[^[\]=→←]+"), + ] ] INTERNAL_REF = 1 @@ -76,6 +78,8 @@ INFINITY_RULE = 'Inf' NOT_A_NUMBER_RULE = 'NaN' SPECIAL_FRACTION_RULE = 'x,x' # there are other options but not existent in CLDR + + # locale.number_symbols['decimal'] # normal rule means a number is specified @@ -86,9 +90,6 @@ class RulesetNotFound(RBNFError): pass class RuleNotFound(RBNFError): pass TokenInfo = collections.namedtuple('TokenInfo', 'type reference optional') - -# compile regex -regex_comp = [(t, re.compile(r)) for t, r in regex] def tokenize(text): @@ -97,37 +98,41 @@ def tokenize(text): Text parsed by matching a list of regular expressions against the beginning of the text. If the regex match - a token is generated and we continue with the rest of + a token is generated, and we continue with the rest of the text. - Some of the tokens are optional if they are in squared - brackets. From regular expressions for the begining and + Some tokens are optional if they are in squared + brackets. From regular expressions for the beginning and end of the optional section no tokens are generated. - Instead all the tokens inside the optional section are - flaged as optional. + Instead, all the tokens inside the optional section are + flagged as optional. - Some of them tokens are referencing other rulesets by name - this information is stored in the token along with the type + Some of the tokens are referencing other rulesets by name. + This information is stored in the token along with the type of reference. """ - # remove uneccesarry syntax (only used in the non-xml form) - if text.endswith(";"): text = text[:-1] - if text.startswith("'"): text = text[1:] + # remove unnecessary syntax (only used in the non-xml form) + if text.endswith(";"): + text = text[:-1] + if text.startswith("'"): + text = text[1:] optional = False while text: stop = True # print("TEXT: ", text) - for tok, regex in regex_comp: + for tok, regex in token_regexes: # print(token, regex) match = regex.match(text) if match: stop = False text = text[match.end():] - if tok == OPT_START: optional = True - elif tok == OPT_END: optional = False + if tok == OPT_START: + optional = True + elif tok == OPT_END: + optional = False else: token = _gen_token(tok, match, optional) if token: @@ -138,7 +143,7 @@ def tokenize(text): def _gen_token(tok, match, optional): - # remove this if CLCR is updated based on ticket + # remove this if CLDR is updated based on ticket # http://unicode.org/cldr/trac/ticket/10544 if tok == INTEGRAL_TOKEN and match.group(2) == '←': warnings.warn('Unsupported syntax ←...←←', SyntaxWarning) @@ -171,13 +176,6 @@ def _parse_reference(string): return INTERNAL_REF, "" # defaults to this -def untokenize_ICU(): - """ - TODO implement ICU style representation - rather make Ruleset.format_icu() - """ - - class RuleBasedNumberFormat(object): """ RuleBasedNumberFormat's behavior consists of one or more rule sets @@ -206,6 +204,7 @@ class RuleBasedNumberFormat(object): the default rule set for this formatter. """ group_types = ('SpelloutRules', 'OrdinalRules', 'NumberingSystemRules') + # spell number should go for Spelloutrules # make interface for the other two groups @@ -434,15 +433,15 @@ class Ruleset(object): SPECIAL_FRACTION_RULE = 'x,x' # there are other options but not existent in CLDR """ + def __init__(self, name, private=False): self.name = name self.private = private self.rules = [] - def apply(self, number, parent, fractional=False): number = decimal.Decimal(str(number)) - # str is needed to avoid unecessary precision + # str is needed to avoid unnecessary precision # decimal is necessary for exact representation in fraction rules context = { @@ -451,7 +450,7 @@ def apply(self, number, parent, fractional=False): 'fractional': fractional, 'omit_optional': False, # no default value is defined in the spec SUBSTITUTION_TOKEN: number, - 'remainder_as_fractional': False # format remainder as fractional rule? + 'remainder_as_fractional': False # format remainder as fractional rule? } integral, remainder = divmod(number, 1) @@ -469,7 +468,7 @@ def apply(self, number, parent, fractional=False): # negative number rule if number < 0: - rule = self.get_rule_special(NEGATIVE_NUMBER_RULE) + rule = self.get_rule_special(NEGATIVE_NUMBER_RULE) if rule is None: raise RuleNotFound("negative number rule (%s)" % NEGATIVE_NUMBER_RULE) context[REMAINDER_TOKEN] = abs(number) @@ -507,22 +506,20 @@ def apply(self, number, parent, fractional=False): i, r = divmod(integral, rule.divisor) context[REMAINDER_TOKEN] = r context[INTEGRAL_TOKEN] = i - context[PREVIOUS_TOKEN] = index-1 # get rule using ruleset - context['omit_optional'] = r != 0 # only if not even multiple (TODO no need to store separatelly) + context[PREVIOUS_TOKEN] = index - 1 # get rule using ruleset + context['omit_optional'] = r != 0 # only if not even multiple (TODO no need to store separately) return rule.apply(number, context) - def get_rule_special(self, val, strict=False): if val in Rule.specials: for r in self.rules: if r.value == val: return r - - # return last rule if no match occured and strict is false + + # return last rule if no match occurred and strict is false if not strict: return self.rules[-1] - def get_rule_integral(self, val): """ Binary-search the rule list for the rule with the highest base value @@ -534,13 +531,13 @@ def get_rule_integral(self, val): it in the rule list. Otherwise, use the rule itself. """ # automatically return last rule if no range matched - ret = len(self.rules)-1 + ret = len(self.rules) - 1 - for i in range(len(self.rules)-1): + for i in range(len(self.rules) - 1): if self.rules[i].value in Rule.specials: continue - - if self.rules[i].value <= val < self.rules[i+1].value: + + if self.rules[i].value <= val < self.rules[i + 1].value: ret = i break @@ -553,16 +550,15 @@ def get_rule_integral(self, val): return ret - def get_rule_fractional(self, val): """If the rule set is a fraction rule set, do the following: Ignore negative-number and fraction rules. - + For each rule in the list, multiply the number being formatted (which will always be between 0 and 1) by the rule's base value. Keep track of the distance between the result and the nearest integer. - + Use the rule that produced the result closest to zero in the above calculation. In the event of a tie or a direct hit, use the first matching rule encountered. (The idea here is to try each rule's base @@ -582,35 +578,33 @@ def get_rule_fractional(self, val): for i, rule in enumerate(self.rules): if rule.value in Rule.specials or rule.value == 0: # ignore specials and 0 rules continue - d = abs(round(val*rule.value) - val*rule.value) + d = abs(round(val * rule.value) - val * rule.value) dists.append((i, d)) # get the index of the closest 0 match bst = min(dists, key=lambda x: x[1])[0] # there is a following rule - if len(self.rules) > bst+1 and \ - self.rules[bst].value == self.rules[bst+1].value and \ - val*self.rules[bst].value > 1: + if len(self.rules) > bst + 1 and \ + self.rules[bst].value == self.rules[bst + 1].value and \ + val * self.rules[bst].value > 1: bst += 1 return bst - def __repr__(self): - return 'Ruleset %s %s\n%s\n' % (self.name, self.private, '\n'.join(['\t'+str(r) for r in self.rules])) + return 'Ruleset %s %s\n%s\n' % (self.name, self.private, '\n'.join(['\t' + str(r) for r in self.rules])) class Rule(object): """ base value, a divisor, rule text, and zero, one, or two substitutions. """ - specials = ( + specials = { NEGATIVE_NUMBER_RULE, IMPROPER_FRACTION_RULE, PROPER_FRACTION_RULE, MASTER_RULE, INFINITY_RULE, NOT_A_NUMBER_RULE, SPECIAL_FRACTION_RULE, - ) - + } def __init__(self, value, text, radix=None): """ @@ -647,8 +641,7 @@ def apply(self, number, context): ruleset = context['search_at'].get_ruleset(ref) elif ref_type == DECIMAL_REF: loc = context['search_at']._locale - x = numbers.format_decimal(number, format=ref, locale=loc) - res.append(x) + res.append(format_decimal(number, format=ref, locale=loc)) if ruleset: if t.type == REMAINDER_TOKEN and context['remainder_as_fractional']: @@ -671,10 +664,8 @@ def apply(self, number, context): else: raise ValueError('unknown token %s', t) - return ''.join(res) - @property def divisor(self): """it is highest exponent of radix less then or equal to the rules's base""" @@ -688,16 +679,14 @@ def divisor(self): def substitutions(self): return len([t for t in self.tokens if t.type in REFERENCE_TOKENS]) - def _parse(self, text): try: self.tokens = [t for t in tokenize(text)] except ValueError: - raise TokenizationError(self.text) - + raise TokenizationError(text) def __repr__(self): return 'Rule %s (%s) - %s\n%s\n' % ( self.value, self.text, self.radix, - '\n'.join(['\t\t'+str(t) for t in self.tokens])) + '\n'.join(['\t\t' + str(t) for t in self.tokens])) diff --git a/tests/test_number_spelling.py b/tests/test_number_spelling.py index bcfcf8644..3a8024e93 100644 --- a/tests/test_number_spelling.py +++ b/tests/test_number_spelling.py @@ -1,26 +1,25 @@ import unittest + import pytest from babel import numbers from babel import rbnf -from babel.core import get_global from babel.localedata import locale_identifiers soft_hyphen = '\xad' + class TestRuleEngine(unittest.TestCase): """ Test everything related to the rules engine """ + def test_basic(self): x = rbnf.RuleBasedNumberFormat.negotiate('hu_HU') assert str(x._locale) == 'hu' assert 'spellout-numbering' in x.available_rulesets - def test_negotiation(self): - valid_ruleset_groups = ("SpelloutRules", "OrdinalRules", "NumberingSystemRules") - for lid in locale_identifiers(): loc = rbnf.RuleBasedNumberFormat.negotiate(lid)._locale if loc is None: @@ -29,8 +28,7 @@ def test_negotiation(self): else: # test groups for k in loc._data['rbnf_rules']: - assert k in valid_ruleset_groups - + assert k in rbnf.RuleBasedNumberFormat.group_types def test_tokenization(self): @@ -41,7 +39,6 @@ def test_tokenization(self): ] assert x == res - def test_xml_parsing(self): """ all the rules should be able to go through the parser and tokenizer @@ -59,6 +56,7 @@ class TestSpelling(unittest.TestCase): """ Locale specific tests """ + def test_hu_HU_cardinal(self): def _spell(x): return numbers.spell_number(x, locale='hu_HU').replace(soft_hyphen, '') @@ -81,7 +79,6 @@ def _spell(x): # assert _spell(-.199923862) == "kerekítve mínusz nulla egész ezerkilencszázkilencvenkilenc tízezred" # assert _spell(.4326752) == "nulla egész negyvenhárom század" - def test_hu_HU_ordinal(self): def _spell(x): return numbers.spell_number(x, locale='hu_HU', ordinal=True).replace(soft_hyphen, '') @@ -101,7 +98,6 @@ def _spell(x): assert _spell(1950) == "ezerkilencszázötvenedik" # assert _spell(2001) == "kétezer-egyedik" - def test_en_GB_cardinal(self): def _spell(x): return numbers.spell_number(x, locale='en_GB').replace(soft_hyphen, '') @@ -122,7 +118,6 @@ def _spell(x): # assert _spell(-.199923862, precision=3, state_rounded=True) == "approximately minus zero point two tenths" # assert _spell(-.1) == "minus zero point one tenth" # float to string conversion preserves precision - def test_en_GB_ordinal(self): def _spell(x): return numbers.spell_number(x, locale='en_GB', ordinal=True).replace(soft_hyphen, '') @@ -179,5 +174,3 @@ def _spell(x): # with pytest.raises(exceptions.NoFractionOrdinalsAllowed) as excinfo: # _spell('1999.23', ordinal=True, locale='en_GB') - - From a6b6d2d5bfb36fd9675c27fcb9b22541c0f5c424 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 31 Dec 2019 12:40:47 +0200 Subject: [PATCH 4/9] rbnf: make spell_number API less kwargsy --- babel/numbers.py | 6 +-- babel/rbnf.py | 98 ++++++++++++++++++++++------------- tests/test_number_spelling.py | 18 +++---- 3 files changed, 75 insertions(+), 47 deletions(-) diff --git a/babel/numbers.py b/babel/numbers.py index d9a0c0462..12e82d219 100644 --- a/babel/numbers.py +++ b/babel/numbers.py @@ -663,15 +663,15 @@ def __init__(self, message, suggestions=None): self.suggestions = suggestions -def spell_number(number, locale=LC_NUMERIC, **kwargs): +def spell_number(number, locale=LC_NUMERIC, ruleset=None): """Return value spelled out for a specific locale :param number: the number to format :param locale: the `Locale` object or locale identifier - :param kwargs: optional locale specific parameters + :param ruleset: the ruleset to use; defaults to regular numbers. """ speller = RuleBasedNumberFormat.negotiate(locale) - return speller.format(number, **kwargs) + return speller.format(number, ruleset=ruleset) def get_rbnf_rules(locale=LC_NUMERIC): diff --git a/babel/rbnf.py b/babel/rbnf.py index 60540ccb2..fbcec253a 100644 --- a/babel/rbnf.py +++ b/babel/rbnf.py @@ -84,10 +84,25 @@ # normal rule means a number is specified -class RBNFError(Exception): pass -class TokenizationError(RBNFError): pass -class RulesetNotFound(RBNFError): pass -class RuleNotFound(RBNFError): pass +class RBNFError(Exception): + pass + + +class TokenizationError(RBNFError): + pass + + +class RulesetNotFound(RBNFError): + pass + + +class RuleNotFound(RBNFError): + pass + + +class RulesetSubstitutionWarning(UserWarning): + pass + TokenInfo = collections.namedtuple('TokenInfo', 'type reference optional') @@ -221,45 +236,56 @@ def available_rulesets(self): """list available public rulesets""" return [r.name for r in self.rulesets if not r.private] - - def format(self, number, ordinal=False, year=False, ruleset=None, **kwargs): - """spell an actual number (int/float/decimal) - - Search available_rulesets for an entry point - default is `spellout-numbering`. - - If year is True: use spellout-numbering-year - If ordinal is True: use spellout-ordinal - If year and ordinal both True: raise error - - TODO - If no `spellout-ordinal`: - if has `spellout-ordinal-*`: use first one, issue warning - + def _find_matching_ruleset(self, prefix): + available_rulesets = self.available_rulesets + if prefix in available_rulesets: + return (prefix, True) + # Sorting here avoids use of more specific ("spellout-ordinal-sinokorean-count") + # rulesets when a shorter one might be available. + for ruleset in sorted(available_rulesets): + if ruleset.startswith(prefix): + return (ruleset, False) + return (None, False) + + def match_ruleset(self, ruleset): """ - if ordinal and year: - raise ValueError('both ordinal and year is not possible') - if ordinal: - search = ruleset or 'spellout-ordinal' - elif year: - search = ruleset or 'spellout-year' - else: - search = ruleset or 'spellout-numbering' - - ruleset = self.get_ruleset(search) - - if ruleset is None: - raise RulesetNotFound(search) - - return ruleset.apply(number, self) + Try to find a matching ruleset given a ruleset name or alias ("year", "ordinal"). + """ + if ruleset == "year": + ruleset = "spellout-numbering-year" + elif ruleset == "ordinal": + ruleset, exact_match = self._find_matching_ruleset("spellout-ordinal") + if not ruleset: + raise RulesetNotFound("No ordinal ruleset is available for %s" % ( + self._locale, + )) + if not exact_match: + warnings.warn("Using non-specific ordinal ruleset %s" % ruleset, RulesetSubstitutionWarning) + ruleset_obj = self.get_ruleset(ruleset) + if not ruleset_obj: + raise RulesetNotFound("Ruleset %r is not one of the ones available for %s: %r" % ( + ruleset, + self._locale, + self.available_rulesets, + )) + return ruleset_obj + + def format(self, number, ruleset=None): + """Format a number (int/float/decimal) with spelling rules. + + Ruleset may be an actual ruleset name for the locale, + or one of the aliases "year" or "ordinal". + """ + if not ruleset: + ruleset = "spellout-numbering" + return self.match_ruleset(ruleset).apply(number, self) def get_ruleset(self, name): for r in self.rulesets: if r.name == name: return r - @classmethod def negotiate(cls, locale): """ @@ -267,6 +293,8 @@ def negotiate(cls, locale): Caching is not necessary the Locale object does that pretty well """ loc = Locale.negotiate([str(Locale.parse(locale))], get_global('rbnf_locales')) + if not loc: + raise RulesetNotFound("No RBNF rules available for %s" % locale) return cls(loc) diff --git a/tests/test_number_spelling.py b/tests/test_number_spelling.py index 3a8024e93..65815b40b 100644 --- a/tests/test_number_spelling.py +++ b/tests/test_number_spelling.py @@ -21,14 +21,14 @@ def test_basic(self): def test_negotiation(self): for lid in locale_identifiers(): - loc = rbnf.RuleBasedNumberFormat.negotiate(lid)._locale - if loc is None: + try: + loc = rbnf.RuleBasedNumberFormat.negotiate(lid)._locale + except rbnf.RulesetNotFound: # generate warning if necessary - pass - else: - # test groups - for k in loc._data['rbnf_rules']: - assert k in rbnf.RuleBasedNumberFormat.group_types + continue + # test groups + for k in loc._data['rbnf_rules']: + assert k in rbnf.RuleBasedNumberFormat.group_types def test_tokenization(self): @@ -81,7 +81,7 @@ def _spell(x): def test_hu_HU_ordinal(self): def _spell(x): - return numbers.spell_number(x, locale='hu_HU', ordinal=True).replace(soft_hyphen, '') + return numbers.spell_number(x, locale='hu_HU', ruleset="ordinal").replace(soft_hyphen, '') assert _spell(0) == "nulla" # assert _spell(0) == "nulladik" @@ -120,7 +120,7 @@ def _spell(x): def test_en_GB_ordinal(self): def _spell(x): - return numbers.spell_number(x, locale='en_GB', ordinal=True).replace(soft_hyphen, '') + return numbers.spell_number(x, locale='en_GB', ruleset="ordinal").replace(soft_hyphen, '') assert _spell(0) == "zeroth" assert _spell(1) == "first" From ba6bbc2a737dabcbb504f31fa3c2bfc0c8aac69d Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 31 Dec 2019 13:11:36 +0200 Subject: [PATCH 5/9] rbnf: store divisor and substitutions in Rule to avoid recomputation --- babel/rbnf.py | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/babel/rbnf.py b/babel/rbnf.py index fbcec253a..6df8d5932 100644 --- a/babel/rbnf.py +++ b/babel/rbnf.py @@ -191,6 +191,17 @@ def _parse_reference(string): return INTERNAL_REF, "" # defaults to this +def compute_divisor(value, radix): + # Compute the highest exponent of radix less than or equal to the rule's base + if isinstance(value, int): + if value == 0: + return 1 + exp = decimal.Decimal(value).ln() / decimal.Decimal(radix).ln() + return int(radix ** math.floor(exp)) + else: + return None + + class RuleBasedNumberFormat(object): """ RuleBasedNumberFormat's behavior consists of one or more rule sets @@ -638,15 +649,16 @@ def __init__(self, value, text, radix=None): """ divisor : iterator of literal, back_sub, fwd_sub, lit_exact elements parsed from rule """ + self.radix = int(radix or 10) if value in self.specials: self.value = value else: self.value = int(value) + self.divisor = compute_divisor(self.value, self.radix) self.text = text - self.radix = int(radix or 10) - - self._parse(text) + self.tokens = list(tokenize(text)) + self.substitutions = len([t for t in self.tokens if t.type in REFERENCE_TOKENS]) def apply(self, number, context): """ @@ -694,25 +706,6 @@ def apply(self, number, context): return ''.join(res) - @property - def divisor(self): - """it is highest exponent of radix less then or equal to the rules's base""" - if isinstance(self.value, int): - if self.value == 0: - return 1 - exp = decimal.Decimal(self.value).ln() / decimal.Decimal(self.radix).ln() - return int(self.radix ** math.floor(exp)) - - @property - def substitutions(self): - return len([t for t in self.tokens if t.type in REFERENCE_TOKENS]) - - def _parse(self, text): - try: - self.tokens = [t for t in tokenize(text)] - except ValueError: - raise TokenizationError(text) - def __repr__(self): return 'Rule %s (%s) - %s\n%s\n' % ( self.value, self.text, From efb57245f57ae4cf554ba372c94a7d7d86ca7ed7 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 31 Dec 2019 13:15:05 +0200 Subject: [PATCH 6/9] rbnf: eagerly evaluate self.rulesets to avoid alias lookup every time --- babel/rbnf.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/babel/rbnf.py b/babel/rbnf.py index 6df8d5932..29d9ea4cf 100644 --- a/babel/rbnf.py +++ b/babel/rbnf.py @@ -237,10 +237,7 @@ class RuleBasedNumberFormat(object): def __init__(self, locale, group='SpelloutRules'): self._locale = locale self._group = group - - @property - def rulesets(self): - return self._locale._data['rbnf_rules'][self._group] + self.rulesets = self._locale._data['rbnf_rules'][self._group] @property def available_rulesets(self): From 777eea610df35aab4806006041d6d1d2bc9f6960 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 31 Dec 2019 13:16:45 +0200 Subject: [PATCH 7/9] rbnf: correctly dump rulesets/rules to JSON file --- scripts/import_cldr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/import_cldr.py b/scripts/import_cldr.py index bd6967b2f..d3e05940b 100755 --- a/scripts/import_cldr.py +++ b/scripts/import_cldr.py @@ -152,6 +152,8 @@ def _compact_dict(dict): def debug_repr(obj): if isinstance(obj, PluralRule): return obj.abstract + if isinstance(obj, (rbnf.Ruleset, rbnf.Rule)): + return vars(obj) return repr(obj) From 8113d1bf9492a5c78239eec24479c4930e418b9d Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Fri, 28 Jan 2022 17:25:28 +0200 Subject: [PATCH 8/9] rbnf: replace .format & friends with f-strings --- babel/rbnf.py | 48 +++++++++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/babel/rbnf.py b/babel/rbnf.py index 29d9ea4cf..77f771240 100644 --- a/babel/rbnf.py +++ b/babel/rbnf.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """ babel.rbnf ~~~~~~~~~~ @@ -30,7 +29,6 @@ # Original request for Hebrew (currently not used in Hebrew): # http://bugs.icu-project.org/trac/ticket/4039 -from __future__ import unicode_literals import re import math @@ -187,7 +185,7 @@ def _parse_reference(string): return PUBLIC_REF, string[1:] if string[0] in '0#': return DECIMAL_REF, string - warnings.warn('Reference parsing error: %s' % string, SyntaxWarning) + warnings.warn(f'Reference parsing error: {string}', SyntaxWarning) return INTERNAL_REF, "" # defaults to this @@ -202,7 +200,7 @@ def compute_divisor(value, radix): return None -class RuleBasedNumberFormat(object): +class RuleBasedNumberFormat: """ RuleBasedNumberFormat's behavior consists of one or more rule sets @@ -264,18 +262,15 @@ def match_ruleset(self, ruleset): elif ruleset == "ordinal": ruleset, exact_match = self._find_matching_ruleset("spellout-ordinal") if not ruleset: - raise RulesetNotFound("No ordinal ruleset is available for %s" % ( - self._locale, - )) + raise RulesetNotFound(f"No ordinal ruleset is available for {self._locale}") if not exact_match: - warnings.warn("Using non-specific ordinal ruleset %s" % ruleset, RulesetSubstitutionWarning) + warnings.warn(f"Using non-specific ordinal ruleset {ruleset}", RulesetSubstitutionWarning) ruleset_obj = self.get_ruleset(ruleset) if not ruleset_obj: - raise RulesetNotFound("Ruleset %r is not one of the ones available for %s: %r" % ( - ruleset, - self._locale, - self.available_rulesets, - )) + raise RulesetNotFound( + f"Ruleset {ruleset!r} is not one of the ones available for " + f"{self._locale}: {self.available_rulesets!r}" + ) return ruleset_obj def format(self, number, ruleset=None): @@ -302,11 +297,11 @@ def negotiate(cls, locale): """ loc = Locale.negotiate([str(Locale.parse(locale))], get_global('rbnf_locales')) if not loc: - raise RulesetNotFound("No RBNF rules available for %s" % locale) + raise RulesetNotFound(f"No RBNF rules available for {locale}") return cls(loc) -class Ruleset(object): +class Ruleset: """ Each rule set consists of a name, a colon, and a list of rules. (in the ICU syntax, CLDR differs because of XML) @@ -496,7 +491,7 @@ def apply(self, number, parent, fractional=False): if fractional: index = self.get_rule_fractional(remainder) if index is None: - raise RuleNotFound("rule for fractional processing of %s" % remainder) + raise RuleNotFound(f"rule for fractional processing of {remainder}") rule = self.rules[index] context[INTEGRAL_TOKEN] = rule.value * remainder # here remainder == number context['omit_optional'] = rule.value * number == 1 @@ -506,7 +501,7 @@ def apply(self, number, parent, fractional=False): if number < 0: rule = self.get_rule_special(NEGATIVE_NUMBER_RULE) if rule is None: - raise RuleNotFound("negative number rule (%s)" % NEGATIVE_NUMBER_RULE) + raise RuleNotFound(f"negative number rule ({NEGATIVE_NUMBER_RULE})") context[REMAINDER_TOKEN] = abs(number) return rule.apply(number, context) @@ -524,12 +519,12 @@ def apply(self, number, parent, fractional=False): if integral == 0: rule = self.get_rule_special(PROPER_FRACTION_RULE) if rule is None: - raise RuleNotFound("proper fraction rule (%s)" % PROPER_FRACTION_RULE) + raise RuleNotFound(f"proper fraction rule ({PROPER_FRACTION_RULE})") else: rule = self.get_rule_special(IMPROPER_FRACTION_RULE) if rule is None: - raise RuleNotFound("improper fraction rule (%s)" % IMPROPER_FRACTION_RULE) + raise RuleNotFound(f"improper fraction rule ({IMPROPER_FRACTION_RULE})") context['omit_optional'] = 0 < number < 1 # between 0 and 1 return rule.apply(number, context) @@ -537,7 +532,7 @@ def apply(self, number, parent, fractional=False): # normal rule index = self.get_rule_integral(integral) if index is None: - raise RuleNotFound("normal rule for %s" % integral) + raise RuleNotFound(f"normal rule for {integral}") rule = self.rules[index] i, r = divmod(integral, rule.divisor) context[REMAINDER_TOKEN] = r @@ -629,10 +624,11 @@ def get_rule_fractional(self, val): return bst def __repr__(self): - return 'Ruleset %s %s\n%s\n' % (self.name, self.private, '\n'.join(['\t' + str(r) for r in self.rules])) + rules_str = '\n'.join(['\t' + str(r) for r in self.rules]) + return f'Ruleset {self.name} {self.private}\n{rules_str}\n' -class Rule(object): +class Rule: """ base value, a divisor, rule text, and zero, one, or two substitutions. """ @@ -699,12 +695,10 @@ def apply(self, number, context): )) else: - raise ValueError('unknown token %s', t) + raise ValueError(f'unknown token {t}', t) return ''.join(res) def __repr__(self): - return 'Rule %s (%s) - %s\n%s\n' % ( - self.value, self.text, - self.radix, - '\n'.join(['\t\t' + str(t) for t in self.tokens])) + tokens_str = '\n'.join(['\t\t' + str(t) for t in self.tokens]) + return f'Rule {self.value} ({self.text}) - {self.radix}\n{tokens_str}\n' From 7e9c9bcf2a63c87670bc7fd1bb9291bd731bd024 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 31 Dec 2019 12:08:30 +0200 Subject: [PATCH 9/9] Add smoke test for all RBNF-enabled locales and rulesets --- setup.cfg | 1 + tests/conftest.py | 4 ++++ tests/test_number_spelling.py | 9 +++++++++ 3 files changed, 14 insertions(+) diff --git a/setup.cfg b/setup.cfg index 12585f0d7..c7f0a0644 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,6 +6,7 @@ norecursedirs = venv* .* _* scripts {args} doctest_optionflags = ELLIPSIS NORMALIZE_WHITESPACE ALLOW_UNICODE IGNORE_EXCEPTION_DETAIL markers = all_locales: parameterize test with all locales + all_rbnf_locales: parameterize test with all locales with RBNF rules [bdist_wheel] universal = 1 diff --git a/tests/conftest.py b/tests/conftest.py index 5b14b1ca7..86c0adf45 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,3 +16,7 @@ def pytest_generate_tests(metafunc): from babel.localedata import locale_identifiers metafunc.parametrize("locale", list(locale_identifiers())) break + if mark.name == "all_rbnf_locales": + from babel.core import get_global + metafunc.parametrize("locale", list(get_global('rbnf_locales'))) + break diff --git a/tests/test_number_spelling.py b/tests/test_number_spelling.py index 65815b40b..6c98ea6af 100644 --- a/tests/test_number_spelling.py +++ b/tests/test_number_spelling.py @@ -148,6 +148,15 @@ def _spell(x): assert _spell(2001) == "two thousand first" +@pytest.mark.all_rbnf_locales +@pytest.mark.parametrize('ruleset', (None, 'year', 'ordinal')) +def test_spelling_smoke(locale, ruleset): + try: + assert numbers.spell_number(2020, locale=locale, ruleset=ruleset) + except rbnf.RulesetNotFound: # Not all locales have all rulesets, so skip the smoke test. + pass + except RecursionError: # Some combinations currently fail with this :( + pytest.xfail(f'Locale {locale}, ruleset {ruleset}') # def test_hu_HU_error(): # with pytest.raises(exceptions.TooBigToSpell) as excinfo: