From 1b0d7ce3727ac9e1dd026d7dc8dd144878b8489e Mon Sep 17 00:00:00 2001 From: Szabolcs Date: Sun, 21 Jul 2019 20:41:03 +0200 Subject: [PATCH] Number spelling based on the CLDR's RBNF rules A pure Python engine for parsing RBNF rules. The rules are incomplete in many cases, fractional number spelling is hardly supported. Based on an earlier discussion: https://github.com/python-babel/babel/pull/114 and referenced in https://github.com/python-babel/babel/issues/179 --- babel/numbers.py | 21 + babel/rbnf.py | 713 ++++++++++++++++++++++++++++++++++ scripts/import_cldr.py | 47 +++ tests/test_number_spelling.py | 187 +++++++++ 4 files changed, 968 insertions(+) create mode 100644 babel/rbnf.py create mode 100644 tests/test_number_spelling.py diff --git a/babel/numbers.py b/babel/numbers.py index 6888c9cb4..5e3ee1b16 100644 --- a/babel/numbers.py +++ b/babel/numbers.py @@ -24,6 +24,7 @@ from babel.core import default_locale, Locale, get_global from babel._compat import decimal, string_types +from babel.rbnf import RuleBasedNumberFormat try: # Python 2 @@ -640,6 +641,26 @@ def __init__(self, message, suggestions=None): self.suggestions = suggestions +def spell_number(number, locale=LC_NUMERIC, **kwargs): + """Return value spelled out for a specific locale + + :param number: the number to format + :param locale: the `Locale` object or locale identifier + :param kwargs: optional locale specific parameters + """ + speller = RuleBasedNumberFormat.negotiate(locale) + return speller.format(number, **kwargs) + + +def get_rbnf_rules(locale=LC_NUMERIC): + """Return all the available public rules for a specific locale + + :param locale: the `Locale` object or locale identifier + """ + speller = RuleBasedNumberFormat.negotiate(locale) + return speller.available_rulesets + + def parse_number(string, locale=LC_NUMERIC): """Parse localized number string into an integer. 
# Token types produced by :func:`tokenize`.
TEXT_TOKEN = 1
INTEGRAL_TOKEN = 2
REMAINDER_TOKEN = 3
PREVIOUS_TOKEN = 4
SUBSTITUTION_TOKEN = 5
PLURAL_TOKEN = 6
OPT_START = 7
OPT_END = 8

# Ordered (token type, pattern) table: the first pattern that matches at the
# beginning of the remaining text wins, so `→→→` must precede `→…→`.
# The plural pattern is non-greedy so that two `$(…)$` groups in one rule
# produce two tokens instead of a single merged one.
# NOTE(review): a `$(…)$` group that directly follows literal text is still
# swallowed by TEXT_TOKEN (the text pattern does not stop at `$(`); emitting
# it as PLURAL_TOKEN requires plural support in the rule engine first.
regex = [
    (PLURAL_TOKEN, r"\$\((.+?)\)\$"),
    (INTEGRAL_TOKEN, r"←([^←\[]*)←(←?)"),
    (PREVIOUS_TOKEN, r"→→→"),
    (REMAINDER_TOKEN, r"→([^→\[]*)→"),
    (SUBSTITUTION_TOKEN, r"=([^=\[]+)="),
    (OPT_START, r"\["),
    (OPT_END, r"\]"),
    (TEXT_TOKEN, r"[^\[\]=→←]+"),
]

# Kinds of reference a substitution descriptor can carry.
INTERNAL_REF = 1
PRIVATE_REF = 2
PUBLIC_REF = 3
PLURAL_REF = 4
DECIMAL_REF = 5

REFERENCE_TOKENS = (INTEGRAL_TOKEN, REMAINDER_TOKEN, SUBSTITUTION_TOKEN)

# Values of the special (non-numeric) rules.
NEGATIVE_NUMBER_RULE = '-x'
IMPROPER_FRACTION_RULE = 'x.x'
PROPER_FRACTION_RULE = '0.x'
MASTER_RULE = 'x.0'
INFINITY_RULE = 'Inf'
NOT_A_NUMBER_RULE = 'NaN'
SPECIAL_FRACTION_RULE = 'x,x'  # there are other options but not existent in CLDR
# locale.number_symbols['decimal']
# normal rule means a number is specified


class RBNFError(Exception):
    """Base class of all errors raised by the RBNF engine."""


class TokenizationError(RBNFError):
    """A rule text could not be split into tokens."""


class RulesetNotFound(RBNFError):
    """The requested ruleset does not exist."""


class RuleNotFound(RBNFError):
    """No rule of a ruleset is applicable to the number."""


TokenInfo = collections.namedtuple('TokenInfo', 'type reference optional')

# compile regex
regex_comp = [(t, re.compile(r)) for t, r in regex]


def tokenize(text):
    """Yield the :class:`TokenInfo` tokens of one rule text.

    The text is consumed from the left: every pattern of ``regex_comp`` is
    tried in order against the beginning of the remaining text, the first
    match produces a token and scanning restarts on the rest of the text.

    Square brackets produce no tokens themselves; tokens between ``[`` and
    ``]`` are flagged ``optional=True``.  Tokens that refer to another
    ruleset (or to a decimal pattern) carry a ``(reference type, name)``
    tuple in ``reference``.

    :param text: the rule text; ``None`` or ``""`` yields no tokens
    :raise ValueError: if no pattern matches the remaining text (the
                       unparsable remainder is the exception argument)
    """
    if not text:
        # an empty rule element has no text at all
        return

    # remove unnecessary syntax (only used in the non-XML form)
    if text.endswith(";"):
        text = text[:-1]
    if text.startswith("'"):
        text = text[1:]

    optional = False

    while text:
        for tok, pattern in regex_comp:
            match = pattern.match(text)
            if match is None:
                continue
            text = text[match.end():]
            if tok == OPT_START:
                optional = True
            elif tok == OPT_END:
                optional = False
            else:
                token = _gen_token(tok, match, optional)
                if token:
                    yield token
            break  # always start searching with the first regex
        else:
            raise ValueError(text)


def _gen_token(tok, match, optional):
    """Build the :class:`TokenInfo` for a matched pattern.

    Returns ``None`` for token types that carry no information.
    """
    # remove this if CLDR is updated based on ticket
    # http://unicode.org/cldr/trac/ticket/10544
    if tok == INTEGRAL_TOKEN and match.group(2) == '←':
        warnings.warn('Unsupported syntax ←...←←', SyntaxWarning)

    if tok in REFERENCE_TOKENS:
        reference = _parse_reference(match.group(1))
        return TokenInfo(tok, reference, optional)

    # currently only `en` has this
    if tok == PLURAL_TOKEN:
        return TokenInfo(tok, (PLURAL_REF, match.group(1)), optional)

    if tok == PREVIOUS_TOKEN:
        return TokenInfo(tok, None, optional)

    if tok == TEXT_TOKEN:
        return TokenInfo(tok, match.group(0), optional)

    return None


def _parse_reference(string):
    """Classify a substitution descriptor.

    :return: ``(reference type, name)``; an empty descriptor refers to the
             current ruleset, ``%%name`` to a private and ``%name`` to a
             public ruleset, a descriptor starting with ``0`` or ``#`` is a
             decimal format pattern.  Anything else issues a
             ``SyntaxWarning`` and is treated like an empty descriptor.
    """
    if string == "":
        return INTERNAL_REF, ""
    if string.startswith('%%'):
        return PRIVATE_REF, string[2:]
    if string.startswith('%'):
        return PUBLIC_REF, string[1:]
    if string[0] in '0#':
        return DECIMAL_REF, string
    warnings.warn('Reference parsing error: %s' % string, SyntaxWarning)
    return INTERNAL_REF, ""  # defaults to this


def untokenize_ICU():
    """
    TODO implement ICU style representation
    rather make Ruleset.format_icu()
    """
class RuleBasedNumberFormat(object):
    """
    RuleBasedNumberFormat's behavior consists of one or more rule sets

    The first ruleset in a locale is the default ruleset.
    The substitution descriptor (i.e., the text between the token characters)
    may take one of three forms:

    :a rule set name:
        Perform the mathematical operation on the number, and format the result
        using the named rule set.
    :a DecimalFormat pattern:
        Perform the mathematical operation on the number, and format the
        result using a DecimalFormat with the specified pattern. The
        pattern must begin with 0 or #.
    :nothing:
        Perform the mathematical operation on the number, and format the
        result using the rule set containing the current rule, except:

        - You can't have an empty substitution descriptor with
          a == substitution.
        - If you omit the substitution descriptor in a >> substitution
          in a fraction rule, format the result one digit at a time
          using the rule set containing the current rule.
        - If you omit the substitution descriptor in a << substitution
          in a rule in a fraction rule set, format the result using
          the default rule set for this formatter.
    """
    group_types = ('SpelloutRules', 'OrdinalRules', 'NumberingSystemRules')
    # spell number should go for SpelloutRules
    # make interface for the other two groups

    def __init__(self, locale, group='SpelloutRules'):
        """
        :param locale: the object whose ``_data['rbnf_rules']`` holds the
                       rulesets; ``None`` if negotiation found no locale
        :param group: the ruleset group to use (see ``group_types``)
        """
        self._locale = locale
        self._group = group

    @property
    def rulesets(self):
        """All rulesets (public and private) of the selected group.

        :raise RulesetNotFound: if no locale was negotiated
        """
        if self._locale is None:
            # negotiate() stores None when no RBNF locale matched; fail with
            # a meaningful error instead of an AttributeError on None
            raise RulesetNotFound(
                'no RBNF rules available (group %s)' % self._group)
        return self._locale._data['rbnf_rules'][self._group]

    @property
    def available_rulesets(self):
        """list available public rulesets"""
        return [r.name for r in self.rulesets if not r.private]

    def format(self, number, ordinal=False, year=False, ruleset=None, **kwargs):
        """spell an actual number (int/float/decimal)

        Search the rulesets for an entry point, the default is
        `spellout-numbering`.

        If year is True: use spellout-numbering-year
        If ordinal is True: use spellout-ordinal
        If year and ordinal both True: raise error
        An explicitly given `ruleset` name takes precedence over the
        defaults above.

        TODO
        If no `spellout-ordinal`:
            if has `spellout-ordinal-*`: use first one, issue warning

        :param number: the number to spell
        :param ordinal: spell as ordinal number
        :param year: spell as year
        :param ruleset: name of the ruleset to use as entry point
        :param kwargs: accepted for locale specific options, currently unused
        :raise ValueError: if both `ordinal` and `year` are true
        :raise RulesetNotFound: if the entry point ruleset does not exist
        """
        if ordinal and year:
            raise ValueError('both ordinal and year is not possible')

        if ordinal:
            default = 'spellout-ordinal'
        elif year:
            # the ruleset is named `spellout-numbering-year`;
            # `spellout-year` was never found
            default = 'spellout-numbering-year'
        else:
            default = 'spellout-numbering'
        search = ruleset or default

        ruleset = self.get_ruleset(search)
        if ruleset is None:
            raise RulesetNotFound(search)

        return ruleset.apply(number, self)

    def get_ruleset(self, name):
        """Return the ruleset called `name`, or ``None`` if there is none."""
        for r in self.rulesets:
            if r.name == name:
                return r
        return None

    @classmethod
    def negotiate(cls, locale):
        """
        Negotiate proper RBNF rules based on global data item `rbnf_locales`
        Caching is not necessary the Locale object does that pretty well

        :param locale: the `Locale` object or locale identifier
        :return: a formatter for the negotiated locale; its ``_locale`` is
                 whatever ``Locale.negotiate`` returned
        """
        loc = Locale.negotiate([str(Locale.parse(locale))], get_global('rbnf_locales'))
        return cls(loc)
class Ruleset(object):
    """
    Each rule set consists of a name, a colon, and a list of rules.
    (in the ICU syntax, CLDR differs because of XML)

    If the rule's rule descriptor is left out, the base value is one plus the
    preceding rule's base value (or zero if this is the first rule in the list)
    in a normal rule set. In a fraction rule set, the base value is the same as
    the preceding rule's base value.

    A rule set may be either a regular rule set or a fraction rule set, depending
    on whether it is used to format a number's integral part (or the whole number)
    or a number's fractional part. Using a rule set to format a rule's fractional
    part makes it a fraction rule set.

    Which rule is used to format a number is defined according to one of the
    following algorithms:

    REGULAR (NON-FRACTION) PROCESSING
    ---------------------------------
    If the rule set is a regular rule set, do the following:

    MASTER_RULE
        If the rule set includes a master rule (and the number was passed in as a
        double), use the master rule. (If the number being formatted was passed
        in as a long, the master rule is ignored.)

    NEGATIVE_NUMBER_RULE
        If the number is negative, use the negative-number rule.

    IMPROPER_FRACTION_RULE
        If the number has a fractional part and is greater than 1, use
        the improper fraction rule.

    PROPER_FRACTION_RULE
        If the number has a fractional part and is between 0 and 1, use
        the proper fraction rule.

    Binary-search the rule list for the rule with the highest base value
    less than or equal to the number. If that rule has two substitutions,
    its base value is not an even multiple of its divisor, and the number
    is an even multiple of the rule's divisor, use the rule that precedes
    it in the rule list. Otherwise, use the rule itself.

    FRACTION PROCESSING
    -------------------
    If the rule set is a fraction rule set, do the following:

    Ignore negative-number and fraction rules.

    For each rule in the list, multiply the number being formatted (which
    will always be between 0 and 1) by the rule's base value. Keep track
    of the distance between the result and the nearest integer.

    Use the rule that produced the result closest to zero in the above
    calculation. In the event of a tie or a direct hit, use the first
    matching rule encountered. (The idea here is to try each rule's base
    value as a possible denominator of a fraction. Whichever denominator
    produces the fraction closest in value to the number being formatted
    wins.)

    If the rule following the matching rule has the same base value,
    use it if the numerator of the fraction is anything other than 1; if
    the numerator is 1, use the original matching rule. (This is to allow
    singular and plural forms of the rule text without a lot of extra hassle.)

    ----

    A rule's body consists of a string of characters terminated by a semicolon.
    The rule may include zero, one, or two substitution tokens, and a range of
    text in brackets. The brackets denote optional text (and may also include
    one or both substitutions). The exact meanings of the substitution tokens,
    and under what conditions optional text is omitted, depend on the syntax
    of the substitution token and the context. The rest of the text in a rule
    body is literal text that is output when the rule matches the number
    being formatted.

    A substitution token begins and ends with a token character. The token
    character and the context together specify a mathematical operation to
    be performed on the number being formatted. An optional substitution
    descriptor specifies how the value resulting from that operation is
    used to fill in the substitution. The position of the substitution
    token in the rule body specifies the location of the resultant text
    in the original rule text.

    The meanings of the substitution token characters are as follows:

    →→  REMAINDER_TOKEN
        :in normal rule:
            Divide the number by the rule's divisor and format the remainder
        :in negative-number rule:
            Find the absolute value of the number and format the result
        :in fraction or master rule:
            Isolate the number's fractional part and format it.
        :in rule in fraction rule set:
            Not allowed.

    →→→  PREVIOUS_TOKEN
        :in normal rule:
            Divide the number by the rule's divisor and format the
            remainder, but bypass the normal rule-selection process
            and just use the rule that precedes this one in this
            rule list.
        :in all other rules:
            Not allowed.

    ←←  INTEGRAL_TOKEN
        :in normal rule:
            Divide the number by the rule's divisor and format the quotient
        :in negative-number rule:
            Not allowed.
        :in fraction or master rule:
            Isolate the number's integral part and format it.
        :in rule in fraction rule set:
            Multiply the number by the rule's base value and format the result.

    ==  SUBSTITUTION_TOKEN
        :in all rule sets:
            Format the number unchanged

    []  OPT_START, OPT_END
        :in normal rule:
            Omit the optional text if the number is an even
            multiple of the rule's divisor
        :in negative-number rule:
            Not allowed.
        :in improper-fraction rule:
            Omit the optional text if the number is between 0 and 1
            (same as specifying both an x.x rule and a 0.x rule)
        :in master rule:
            Omit the optional text if the number is an integer
            (same as specifying both an x.x rule and an x.0 rule)
            !!! contradicts the above as it says the master rule is ignored
        :in proper-fraction rule:
            Not allowed.
        :in rule in fraction rule set:
            Omit the optional text if multiplying the number by the
            rule's base value yields 1.

    $(cardinal,plural syntax)$  PLURAL_TOKEN
        :in all rule sets:
            This provides the ability to choose a word based on the
            number divided by the radix to the power of the exponent
            of the base value for the specified locale, which is
            normally equivalent to the ←← value. This uses the cardinal
            plural rules from PluralFormat. All strings used in the
            plural format are treated as the same base value for parsing.

    $(ordinal,plural syntax)$  PLURAL_TOKEN
        :in all rule sets:
            This provides the ability to choose a word based on the
            number divided by the radix to the power of the exponent
            of the base value for the specified locale, which is
            normally equivalent to the ←← value. This uses the ordinal
            plural rules from PluralFormat. All strings used in the
            plural format are treated as the same base value for parsing.

    INFINITY_RULE = 'Inf'

    NOT_A_NUMBER_RULE = 'NaN'

    SPECIAL_FRACTION_RULE = 'x,x'  # there are other options but not existent in CLDR
    """
    def __init__(self, name, private=False):
        """
        :param name: name of the ruleset
        :param private: whether the ruleset is private
        """
        self.name = name
        self.private = private
        self.rules = []

    def apply(self, number, parent, fractional=False):
        """Spell `number` with the applicable rule of this ruleset.

        :param number: the number to spell (anything ``str()`` renders as
                       a decimal literal)
        :param parent: the object whose ``get_ruleset(name)`` resolves
                       references to other rulesets
        :param fractional: process this ruleset as a fraction ruleset
        :raise RuleNotFound: if no rule can be selected
        """
        number = decimal.Decimal(str(number))
        # str is needed to avoid unnecessary precision
        # decimal is necessary for exact representation in fraction rules

        # NOTE: despite its name, `Rule.apply` emits the optional tokens
        # only when context['omit_optional'] is true and skips them when it
        # is false; every assignment below follows that reading.
        context = {
            'search_at': parent,
            'ruleset': self,
            'fractional': fractional,
            'omit_optional': False,  # no default value is defined in the spec
            SUBSTITUTION_TOKEN: number,
            'remainder_as_fractional': False  # format remainder as fractional rule?
        }
        integral, remainder = divmod(number, 1)

        # fractional rule (ruleset in fractional processing)
        # the value should always be between 0 and 1
        # not yet tested it needs clarification
        if fractional:
            index = self.get_rule_fractional(remainder)
            if index is None:
                raise RuleNotFound("rule for fractional processing of %s" % remainder)
            rule = self.rules[index]
            context[INTEGRAL_TOKEN] = rule.value * remainder  # here remainder == number
            # optional text is dropped only if the product is exactly 1
            context['omit_optional'] = rule.value * number != 1
            return rule.apply(number, context)

        # negative number rule
        if number < 0:
            rule = self.get_rule_special(NEGATIVE_NUMBER_RULE)
            if rule is None:
                raise RuleNotFound("negative number rule (%s)" % NEGATIVE_NUMBER_RULE)
            context[REMAINDER_TOKEN] = abs(number)
            return rule.apply(number, context)

        # master and fraction rules
        if remainder != 0:
            context[REMAINDER_TOKEN] = number - integral
            context[INTEGRAL_TOKEN] = integral
            context['remainder_as_fractional'] = True

            # search for master rule
            rule = self.get_rule_special(MASTER_RULE, strict=True)

            if rule is not None:
                # master rule: optional text is dropped only for integers,
                # which never reach this branch
                context['omit_optional'] = True
            elif integral == 0:
                rule = self.get_rule_special(PROPER_FRACTION_RULE)
                if rule is None:
                    raise RuleNotFound("proper fraction rule (%s)" % PROPER_FRACTION_RULE)
            else:
                rule = self.get_rule_special(IMPROPER_FRACTION_RULE)
                if rule is None:
                    raise RuleNotFound("improper fraction rule (%s)" % IMPROPER_FRACTION_RULE)
                # optional text is dropped only between 0 and 1
                context['omit_optional'] = not (0 < number < 1)

            return rule.apply(number, context)

        # normal rule
        index = self.get_rule_integral(integral)
        if index is None:
            raise RuleNotFound("normal rule for %s" % integral)
        rule = self.rules[index]
        i, r = divmod(integral, rule.divisor)
        context[REMAINDER_TOKEN] = r
        context[INTEGRAL_TOKEN] = i
        context[PREVIOUS_TOKEN] = index - 1  # get rule using ruleset
        context['omit_optional'] = r != 0  # only if not even multiple
        return rule.apply(number, context)

    def get_rule_special(self, val, strict=False):
        """Return the special rule whose value is `val`.

        :param val: one of ``Rule.specials``
        :param strict: if false and there is no such rule, fall back to the
                       last rule of the ruleset
        :return: the rule, or ``None`` if nothing can be returned
        """
        if val in Rule.specials:
            for r in self.rules:
                if r.value == val:
                    return r

        # return last rule if no match occurred and strict is false
        if not strict and self.rules:
            return self.rules[-1]
        return None

    def get_rule_integral(self, val):
        """
        Search the rule list for the rule with the highest base value
        less than or equal to the number.

        If that rule has two substitutions,
        its base value is not an even multiple of its divisor, and the number
        is an even multiple of the rule's divisor, use the rule that precedes
        it in the rule list. Otherwise, use the rule itself.

        :return: index of the rule in ``self.rules``; ``None`` if the
                 ruleset has no rules at all
        """
        if not self.rules:
            return None

        # Only normal rules take part in the search, so an int is never
        # compared with the string value of a special rule.  Rules are
        # expected in ascending order of base value.
        best = None
        for i, rule in enumerate(self.rules):
            if rule.value in Rule.specials:
                continue
            if rule.value <= val:
                best = i

        # automatically return last rule if no rule matched
        ret = len(self.rules) - 1 if best is None else best

        rule = self.rules[ret]
        divisor = rule.divisor  # None for special rules
        if ret > 0 and divisor and rule.substitutions == 2 and \
                rule.value % divisor != 0 and \
                val % divisor == 0:
            ret -= 1

        return ret

    def get_rule_fractional(self, val):
        """If the rule set is a fraction rule set, do the following:

        Ignore negative-number and fraction rules.

        For each rule in the list, multiply the number being formatted (which
        will always be between 0 and 1) by the rule's base value. Keep track
        of the distance between the result and the nearest integer.

        Use the rule that produced the result closest to zero in the above
        calculation. In the event of a tie or a direct hit, use the first
        matching rule encountered. (The idea here is to try each rule's base
        value as a possible denominator of a fraction. Whichever denominator
        produces the fraction closest in value to the number being formatted
        wins.)

        If the rule following the matching rule has the same base value,
        use it if the numerator of the fraction is anything other than 1; if
        the numerator is 1, use the original matching rule. (This is to allow
        singular and plural forms of the rule text without a lot of extra hassle.)

        ??? what is considered the numerator of what fraction here
        ??? is it rather not the closest integer

        :return: index of the rule in ``self.rules``; ``None`` if no rule
                 is usable
        """
        if not isinstance(val, decimal.Decimal):
            val = decimal.Decimal(str(val))

        dists = []
        for i, rule in enumerate(self.rules):
            if rule.value in Rule.specials or rule.value == 0:  # ignore specials and 0 rules
                continue
            product = val * rule.value
            # to_integral_value() rounds Decimals on Python 2 and 3 alike
            dists.append((i, abs(product.to_integral_value() - product)))

        if not dists:
            return None

        # get the index of the closest 0 match (min keeps the first on a tie)
        bst = min(dists, key=lambda x: x[1])[0]

        # there is a following rule
        if len(self.rules) > bst + 1 and \
                self.rules[bst].value == self.rules[bst + 1].value and \
                val * self.rules[bst].value > 1:
            bst += 1

        return bst

    def __repr__(self):
        return 'Ruleset %s %s\n%s\n' % (self.name, self.private, '\n'.join(['\t'+str(r) for r in self.rules]))
class Rule(object):
    """
    base value, a divisor, rule text, and zero, one, or two substitutions.
    """
    specials = (
        NEGATIVE_NUMBER_RULE, IMPROPER_FRACTION_RULE,
        PROPER_FRACTION_RULE, MASTER_RULE, INFINITY_RULE,
        NOT_A_NUMBER_RULE, SPECIAL_FRACTION_RULE,
    )

    def __init__(self, value, text, radix=None):
        """
        :param value: base value of the rule: one of ``specials`` or
                      something ``int()`` accepts
        :param text: the rule text, tokenized on construction
        :param radix: radix used for the divisor, 10 if omitted
        :raise TokenizationError: if `text` can not be tokenized
        """
        if value in self.specials:
            self.value = value
        else:
            try:
                self.value = int(value)
            except (TypeError, ValueError):
                warnings.warn("Unknown rule value: [%s]" % value, SyntaxWarning)
                # keep the attribute defined so the rule can at least be
                # inspected and printed
                self.value = None

        self.text = text
        self._radix = radix

        self._parse(text)

    def apply(self, number, context):
        """Render `number` with the tokens of this rule.

        :param number: the number the rule was selected for
        :param context: the dict built by ``Ruleset.apply``: pre-computed
                        operands keyed by token type plus the keys
                        ``ruleset``, ``search_at``, ``fractional``,
                        ``omit_optional`` and ``remainder_as_fractional``
        :raise ValueError: for a token type the engine can not render
        """
        # imported here because babel.numbers imports this module
        from .numbers import format_decimal
        res = []
        for t in self.tokens:
            if t.optional and not context['omit_optional']:
                continue

            if t.type == TEXT_TOKEN:
                res.append(t.reference)

            elif t.type in REFERENCE_TOKENS:
                ref_type, ref = t.reference
                ruleset = None
                if ref_type == INTERNAL_REF:
                    ruleset = context['ruleset']
                elif ref_type in (PUBLIC_REF, PRIVATE_REF):  # currently no distinction
                    ruleset = context['search_at'].get_ruleset(ref)
                elif ref_type == DECIMAL_REF:
                    loc = context['search_at']._locale
                    # format the operand of this substitution; fall back to
                    # the number itself if none was computed
                    operand = context.get(t.type, number)
                    res.append(format_decimal(operand, format=ref, locale=loc))

                if ruleset is not None:
                    if t.type == REMAINDER_TOKEN and context['remainder_as_fractional']:
                        fractional = True
                    else:
                        fractional = context['fractional']
                    res.append(ruleset.apply(
                        context[t.type],  # number
                        context['search_at'],  # parent
                        fractional,
                    ))

            elif t.type == PREVIOUS_TOKEN:
                rule = context['ruleset'].rules[context[PREVIOUS_TOKEN]]
                res.append(rule.apply(
                    context[REMAINDER_TOKEN],  # number
                    context,  # ???
                ))

            else:
                # e.g. PLURAL_TOKEN, which is tokenized but not rendered yet;
                # `t` is a tuple, hence the explicit 1-tuple
                raise ValueError('unknown token %s' % (t,))

        return ''.join(res)

    @property
    def divisor(self):
        """it is highest exponent of radix less then or equal to the rules's base

        ``None`` for rules without a numeric base value.

        :raise ValueError: if the radix is smaller than 2
        """
        if not isinstance(self.value, int):
            return None
        radix = self.radix
        if radix < 2:
            raise ValueError('radix must be at least 2, got %r' % (radix,))
        # exact integer arithmetic instead of a ratio of logarithms
        divisor = 1
        while divisor * radix <= self.value:
            divisor *= radix
        return divisor

    @property
    def radix(self):
        """The radix as an integer, 10 if the rule defines none."""
        # the XML attribute arrives as a string
        return int(self._radix) if self._radix else 10

    @property
    def substitutions(self):
        """Number of substitution tokens of the rule."""
        return len([t for t in self.tokens if t.type in REFERENCE_TOKENS])

    def _parse(self, text):
        """Tokenize `text` into ``self.tokens``."""
        try:
            self.tokens = [t for t in tokenize(text)]
        except ValueError:
            raise TokenizationError(self.text)

    def __repr__(self):
        return 'Rule %s (%s) - %s\n%s\n' % (
            self.value, self.text,
            self.radix,
            '\n'.join(['\t\t'+str(t) for t in self.tokens]))
def parse_rbnf_rules(data, tree):
    """
    Parse rules based on:
    http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting

    Every ``rulesetGrouping`` found below ``rbnf`` becomes a list of
    ``rbnf.Ruleset`` objects stored in ``data['rbnf_rules'][<group type>]``.
    Rules that can not be tokenized are logged and left out.

    :param data: the locale data dict being filled, updated in place
    :param tree: the parsed rbnf XML tree of the locale
    """
    rbnf_rules = data.setdefault('rbnf_rules', {})

    for ruleset_grouping in tree.findall('.//rbnf/rulesetGrouping'):
        group_name = ruleset_grouping.attrib['type']
        rbnf_rules[group_name] = []  # TODO check for overwrite
        for ruleset in ruleset_grouping.findall('ruleset'):
            ruleset_name = ruleset.attrib['type']
            private = ruleset.attrib.get('access') == 'private'
            ruleset_obj = rbnf.Ruleset(ruleset_name, private)
            for rule in ruleset.findall('rbnfrule'):
                radix = rule.attrib.get('radix')
                try:
                    rule_obj = rbnf.Rule(rule.attrib['value'], rule.text, radix)
                except rbnf.TokenizationError:
                    # arguments follow the `value/radix: text` layout of the
                    # message
                    log('%s: Unable to parse rule "%s%s: %s "' % (
                        data['locale_id'],
                        rule.attrib['value'],
                        '' if radix is None else ('/%s' % radix),
                        rule.text,
                    ))
                    # no rule object exists for this element: skip it rather
                    # than append an unbound or stale `rule_obj`
                    continue
                ruleset_obj.rules.append(rule_obj)
            rbnf_rules[group_name].append(ruleset_obj)
raw rules should be in a specific structure based + on the XML specification + """ + assert True + + +class TestSpelling(unittest.TestCase): + """ + Locale specific tests + """ + def test_hu_HU_cardinal(self): + def _spell(x): + return numbers.spell_number(x, locale='hu_HU').replace(soft_hyphen, '') + + assert _spell(0) == "nulla" + assert _spell(1) == "egy" + assert _spell(2) == u"kettő" + assert _spell(3) == u"három" + assert _spell(10) == u"tíz" + assert _spell(20) == u"húsz" + # assert _spell('-0') == "mínusz nulla" + # assert _spell(123.25) == "százhuszonhárom egész huszonöt század" + assert _spell(-12) == u"mínusz tizenkettő" + # assert _spell(23457829) == "huszonhárommillió-négyszázötvenhétezer-nyolcszázhuszonkilenc" + assert _spell(1950) == u"ezerkilencszázötven" + # only soft hyphens in the rules !!! + # assert _spell(2001) == "kétezer-egy" + # assert _spell('1999.2386') == "ezerkilencszázkilencvenkilenc egész kétezer-háromszáznyolcvanhat tízezred" + # assert _spell(-.199923862) == "mínusz nulla egész százkilencvenkilencezer-kilencszázhuszonnégy milliomod" + # assert _spell(-.199923862) == "kerekítve mínusz nulla egész ezerkilencszázkilencvenkilenc tízezred" + # assert _spell(.4326752) == "nulla egész negyvenhárom század" + + + def test_hu_HU_ordinal(self): + def _spell(x): + return numbers.spell_number(x, locale='hu_HU', ordinal=True).replace(soft_hyphen, '') + + assert _spell(0) == "nulla" + # assert _spell(0) == "nulladik" + assert _spell(1) == "első" + assert _spell(2) == "második" + assert _spell(3) == "harmadik" + assert _spell(10) == "tizedik" + assert _spell(20) == "huszadik" + assert _spell(30) == "harmincadik" + assert _spell(-12) == "mínusz tizenkettedik" + # assert _spell(23457829) == "huszonhárommilliónégyszázötvenhétezernyolcszázhuszonkilencedik" # wrong mutiple cldr errors + # assert _spell(23457829) == "huszonhárommillió-négyszázötvenhétezer-nyolcszázhuszonkilencedik" + assert _spell(1100) == "ezerszázadik" + assert _spell(1950) == 
"ezerkilencszázötvenedik" + # assert _spell(2001) == "kétezer-egyedik" + + + def test_en_GB_cardinal(self): + def _spell(x): + return numbers.spell_number(x, locale='en_GB').replace(soft_hyphen, '') + + assert _spell(0) == "zero" + assert _spell(1) == "one" + assert _spell(2) == "two" + assert _spell(3) == "three" + # assert _spell('-0') == "minus zero" + # assert _spell(123.25) == "one hundred and twenty-three point twenty-five hundredths" + assert _spell(-12) == "minus twelve" + assert _spell(23457829) == "twenty-three million four hundred fifty-seven thousand eight hundred twenty-nine" + # assert _spell(23457829) == "twenty-three million four hundred and fifty-seven thousand eight hundred and twenty-nine" + assert _spell(1950) == "one thousand nine hundred fifty" + # assert _spell(1950) == "one thousand nine hundred and fifty" + assert _spell(2001) == "two thousand one" + # assert _spell('1999.238') == "one thousand nine hundred and ninety-nine point two hundred and thirty-eight thousandths" + # assert _spell(-.199923862, precision=3, state_rounded=True) == "approximately minus zero point two tenths" + # assert _spell(-.1) == "minus zero point one tenth" # float to string conversion preserves precision + + + def test_en_GB_ordinal(self): + def _spell(x): + return numbers.spell_number(x, locale='en_GB', ordinal=True).replace(soft_hyphen, '') + + assert _spell(0) == "zeroth" + assert _spell(1) == "first" + assert _spell(2) == "second" + assert _spell(3) == "third" + assert _spell(4) == "fourth" + assert _spell(5) == "fifth" + assert _spell(6) == "sixth" + assert _spell(7) == "seventh" + assert _spell(8) == "eighth" + assert _spell(9) == "ninth" + assert _spell(10) == "tenth" + assert _spell(11) == "eleventh" + assert _spell(12) == "twelfth" + assert _spell(13) == "thirteenth" + assert _spell(20) == "twentieth" + assert _spell(30) == "thirtieth" + assert _spell(40) == "fortieth" + # assert _spell(40) == "fourtieth" + assert _spell(-12) == "minus twelfth" + # assert 
_spell(23457829) == "twenty-three million four hundred fifty-seven thousand eight hundred twenty-ninth" # apostrophes + # assert _spell(23457829) == "twenty-three million four hundred and fifty-seven thousand eight hundred and twenty-ninth" + assert _spell(1950) == "one thousand nine hundred fiftieth" + # assert _spell(1950) == "one thousand nine hundred and fiftieth" + assert _spell(2001) == "two thousand first" + + + +# def test_hu_HU_error(): +# with pytest.raises(exceptions.TooBigToSpell) as excinfo: +# _spell(10**66, ordinal=True) + +# with pytest.raises(exceptions.PrecisionError) as excinfo: +# _spell(.4326752, locale='hu_HU', precision=7) + +# with pytest.raises(exceptions.PrecisionError) as excinfo: +# _spell(.4326752) + +# with pytest.raises(exceptions.NoFractionOrdinalsAllowed) as excinfo: +# _spell('1999.23862', ordinal=True) + +# def test_en_GB_error(): +# with pytest.raises(exceptions.TooBigToSpell) as excinfo: +# _spell(10**24, ordinal=True, locale='en_GB') + +# with pytest.raises(exceptions.PrecisionError) as excinfo: +# _spell(.4326752, locale='en_GB', precision=4) + +# with pytest.raises(exceptions.PrecisionError) as excinfo: +# _spell(.4326752, locale='en_GB') + +# with pytest.raises(exceptions.NoFractionOrdinalsAllowed) as excinfo: +# _spell('1999.23', ordinal=True, locale='en_GB') + +