From 1b0d7ce3727ac9e1dd026d7dc8dd144878b8489e Mon Sep 17 00:00:00 2001
From: Szabolcs <szabolcs@birosign.com>
Date: Sun, 21 Jul 2019 20:41:03 +0200
Subject: [PATCH] Number spelling based on the CLDR's RBNF rules

A pure Python engine for parsing RBNF rules.
The rules are incomplete in many cases, fractional
number spelling is hardly supported.

Based on an earlier discussion:
https://github.com/python-babel/babel/pull/114
and referenced in
https://github.com/python-babel/babel/issues/179
---
 babel/numbers.py              |  21 +
 babel/rbnf.py                 | 713 ++++++++++++++++++++++++++++++++++
 scripts/import_cldr.py        |  47 +++
 tests/test_number_spelling.py | 187 +++++++++
 4 files changed, 968 insertions(+)
 create mode 100644 babel/rbnf.py
 create mode 100644 tests/test_number_spelling.py

diff --git a/babel/numbers.py b/babel/numbers.py
index 6888c9cb4..5e3ee1b16 100644
--- a/babel/numbers.py
+++ b/babel/numbers.py
@@ -24,6 +24,7 @@
 
 from babel.core import default_locale, Locale, get_global
 from babel._compat import decimal, string_types
+from babel.rbnf import RuleBasedNumberFormat
 
 try:
     # Python 2
@@ -640,6 +641,26 @@ def __init__(self, message, suggestions=None):
         self.suggestions = suggestions
 
 
+def spell_number(number, locale=LC_NUMERIC, **kwargs):
+    """Return value spelled out for a specific locale
+    
+    :param number: the number to format
+    :param locale: the `Locale` object or locale identifier
+    :param kwargs: optional locale specific parameters
+    """
+    speller = RuleBasedNumberFormat.negotiate(locale)
+    return speller.format(number, **kwargs)
+
+
+def get_rbnf_rules(locale=LC_NUMERIC):
+    """Return all the available public rules for a specific locale
+
+    :param locale: the `Locale` object or locale identifier
+    """
+    speller = RuleBasedNumberFormat.negotiate(locale)
+    return speller.available_rulesets
+
+
 def parse_number(string, locale=LC_NUMERIC):
     """Parse localized number string into an integer.
 
diff --git a/babel/rbnf.py b/babel/rbnf.py
new file mode 100644
index 000000000..2a5d24275
--- /dev/null
+++ b/babel/rbnf.py
@@ -0,0 +1,713 @@
+# -*- coding: utf-8 -*-
+"""
+babel.rbnf
+~~~~~~~~~~
+
+Locale dependent spelling of numbers.
+
+Documentation:
+-   http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting
+-   http://www.icu-project.org/apiref/icu4c/classRuleBasedNumberFormat.html
+
+Examples
+-   http://userguide.icu-project.org/formatparse/numbers/rbnf-examples
+-   http://source.icu-project.org/repos/icu/trunk/icu4j/demos/src/com/ibm/icu/dev/demo/rbnf/RbnfSampleRuleSets.java
+
+    
+"""
+# Dev notes
+#
+# Reloading cldr:
+# python ./scripts/import_cldr.py ./cldr/cldr-core-35.1/common/ -f
+# 
+# Tokenization is inspired by Ka-Ping Yee's tokenize library
+
+# Undocumented syntax (←%rule-name←←)
+# Trac ticket filed for CLDR update PL rbnf
+#     http://unicode.org/cldr/trac/ticket/10544
+# The syntax may need to be supported:
+#     http://bugs.icu-project.org/trac/ticket/13264
+# Original request for Hebrew (currently not used in Hebrew):
+#     http://bugs.icu-project.org/trac/ticket/4039
+
+from __future__ import unicode_literals
+
+import re
+import sys
+import math
+import decimal
+import collections
+import warnings
+
+from babel.core import Locale, get_global
+
+TEXT_TOKEN = 1
+INTEGRAL_TOKEN = 2
+REMAINDER_TOKEN = 3
+PREVIOUS_TOKEN = 4
+SUBSTITUTION_TOKEN = 5
+PLURAL_TOKEN = 6
+OPT_START = 7
+OPT_END = 8
+
+regex = [
+    (PLURAL_TOKEN,          r"\$\((.+)\)\$"),
+    (INTEGRAL_TOKEN,        r"←([^←[]*)←(←?)"),
+    (PREVIOUS_TOKEN,        r"→→→"),
+    (REMAINDER_TOKEN,       r"→([^→[]*)→"),
+    (SUBSTITUTION_TOKEN,    r"=([^=[]+)="),
+    (OPT_START,             r"\["),
+    (OPT_END,               r"\]"),
+    (TEXT_TOKEN,            r"[^[\]=→←]+"),
+]
+
+INTERNAL_REF = 1
+PRIVATE_REF = 2
+PUBLIC_REF = 3
+PLURAL_REF = 4
+DECIMAL_REF = 5
+
+REFERENCE_TOKENS = (INTEGRAL_TOKEN, REMAINDER_TOKEN, SUBSTITUTION_TOKEN)
+
+NEGATIVE_NUMBER_RULE = '-x'
+IMPROPER_FRACTION_RULE = 'x.x'
+PROPER_FRACTION_RULE = '0.x'
+MASTER_RULE = 'x.0'
+INFINITY_RULE = 'Inf'
+NOT_A_NUMBER_RULE = 'NaN'
+SPECIAL_FRACTION_RULE = 'x,x'  # there are other options but not existent in CLDR
+# locale.number_symbols['decimal']
+# normal rule means a number is specified
+
+
+class RBNFError(Exception): pass
+class TokenizationError(RBNFError): pass
+class RulesetNotFound(RBNFError): pass
+class RuleNotFound(RBNFError): pass
+
+TokenInfo = collections.namedtuple('TokenInfo', 'type reference optional')
+        
+# compile regex
+regex_comp = [(t, re.compile(r)) for t, r in regex]
+
+
+def tokenize(text):
+    """
+    Split the body of a rule into a stream of ``TokenInfo`` tuples.
+
+    The text is parsed by matching a list of regular expressions
+    against the beginning of the text. If a regex matches,
+    a token is generated and parsing continues with the rest of
+    the text.
+
+    Some of the tokens are optional: those inside square
+    brackets. The regular expressions for the beginning and
+    end of the optional section generate no tokens themselves.
+    Instead all the tokens inside the optional section are
+    flagged as optional.
+
+    Some of the tokens reference other rulesets by name;
+    this information is stored in the token along with the type
+    of reference.
+    :raises ValueError: if no expression matches the remaining text
+    """
+    # remove unnecessary syntax (only used in the non-xml form)
+    if text.endswith(";"): text = text[:-1]
+    if text.startswith("'"): text = text[1:]
+
+    optional = False
+
+    while text:
+        stop = True
+        # try the compiled patterns in order against the head of the text
+        for tok, regex in regex_comp:
+            # note: `regex` here is a local name for one compiled pattern
+            match = regex.match(text)
+            if match:
+                stop = False
+                text = text[match.end():]
+                if tok == OPT_START: optional = True
+                elif tok == OPT_END: optional = False
+                else:
+                    token = _gen_token(tok, match, optional)
+                    if token:
+                        yield token
+                break  # always start searching with the first regex
+        if stop:
+            raise ValueError(text)
+
+
+def _gen_token(tok, match, optional):
+    # remove this warning once CLDR is updated based on ticket
+    # http://unicode.org/cldr/trac/ticket/10544
+    if tok == INTEGRAL_TOKEN and match.group(2) == '←':
+        warnings.warn('Unsupported syntax ←...←←', SyntaxWarning)
+
+    if tok in REFERENCE_TOKENS:
+        reference = _parse_reference(match.group(1))
+        return TokenInfo(tok, reference, optional)
+
+    # currently only `en` has this
+    if tok == PLURAL_TOKEN:
+        return TokenInfo(tok, (PLURAL_REF, match.group(1)), optional)
+
+    if tok == PREVIOUS_TOKEN:
+        return TokenInfo(tok, None, optional)
+
+    if tok == TEXT_TOKEN:
+        return TokenInfo(tok, match.group(0), optional)
+
+
+def _parse_reference(string):
+    if string == "":
+        return INTERNAL_REF, ""
+    if string.startswith('%%'):
+        return PRIVATE_REF, string[2:]
+    if string.startswith('%'):
+        return PUBLIC_REF, string[1:]
+    if string[0] in '0#':
+        return DECIMAL_REF, string
+    warnings.warn('Reference parsing error: %s' % string, SyntaxWarning)
+    return INTERNAL_REF, ""  # defaults to this
+
+
+def untokenize_ICU():
+    """
+    TODO implement ICU style representation
+    rather make Ruleset.format_icu()
+    """
+
+
+class RuleBasedNumberFormat(object):
+    """
+    RuleBasedNumberFormat's behavior consists of one or more rule sets
+
+    The first ruleset in a locale is the default ruleset.
+    The substitution descriptor (i.e., the text between the token characters)
+    may take one of three forms:
+    :a rule set name:
+        Perform the mathematical operation on the number, and format the result
+        using the named rule set.
+    :a DecimalFormat pattern:
+        Perform the mathematical operation on the number, and format the
+        result using a DecimalFormat with the specified pattern. The
+        pattern must begin with 0 or #.
+    :nothing:
+        Perform the mathematical operation on the number, and format the
+        result using the rule set containing the current rule, except:
+        
+        -   You can't have an empty substitution descriptor with
+            a == substitution.
+        -   If you omit the substitution descriptor in a >> substitution
+            in a fraction rule, format the result one digit at a time
+            using the rule set containing the current rule.
+        -   If you omit the substitution descriptor in a << substitution
+            in a rule in a fraction rule set, format the result using
+            the default rule set for this formatter.
+    """
+    group_types = ('SpelloutRules', 'OrdinalRules', 'NumberingSystemRules')
+    # spell number should go for Spelloutrules
+    # make interface for the other two groups
+
+    def __init__(self, locale, group='SpelloutRules'):
+        self._locale = locale
+        self._group = group
+
+    @property
+    def rulesets(self):
+        return self._locale._data['rbnf_rules'][self._group]
+
+    @property
+    def available_rulesets(self):
+        """list available public rulesets"""
+        return [r.name for r in self.rulesets if not r.private]
+
+
+    def format(self, number, ordinal=False, year=False, ruleset=None, **kwargs):
+        """Spell out a number (int/float/decimal) using a ruleset of this locale.
+
+        The entry point is looked up by name among the rulesets of the group;
+        an explicit `ruleset` name takes precedence over the flags below.
+
+        If year is True: use spellout-numbering-year
+        If ordinal is True: use spellout-ordinal
+        If year and ordinal both True: raise ValueError
+        Otherwise: use spellout-numbering
+        :raises RulesetNotFound: if no ruleset has the requested name
+        TODO
+        If no `spellout-ordinal`:
+            if has `spellout-ordinal-*`: use first one, issue warning
+        """
+        if ordinal and year:
+            raise ValueError('both ordinal and year is not possible')
+        if ordinal:
+            search = ruleset or 'spellout-ordinal'
+        elif year:
+            search = ruleset or 'spellout-numbering-year'
+        else:
+            search = ruleset or 'spellout-numbering'
+
+        ruleset = self.get_ruleset(search)
+
+        if ruleset is None:
+            raise RulesetNotFound(search)
+
+        return ruleset.apply(number, self)
+
+
+    def get_ruleset(self, name):
+        for r in self.rulesets:
+            if r.name == name:
+                return r
+
+
+    @classmethod
+    def negotiate(cls, locale):
+        """
+        Negotiate proper RBNF rules based on global data item `rbnf_locales`
+        Caching is not necessary the Locale object does that pretty well
+        """
+        loc = Locale.negotiate([str(Locale.parse(locale))], get_global('rbnf_locales'))
+        return cls(loc)
+
+
+class Ruleset(object):
+    """
+    Each rule set consists of a name, a colon, and a list of rules.
+    (in the ICU syntax, CLDR differs because of XML)
+
+    If the rule's rule descriptor is left out, the base value is one plus the
+    preceding rule's base value (or zero if this is the first rule in the list)
+    in a normal rule set.  In a fraction rule set, the base value is the same as
+    the preceding rule's base value.
+
+    A rule set may be either a regular rule set or a fraction rule set, depending
+    on whether it is used to format a number's integral part (or the whole number)
+    or a number's fractional part. Using a rule set to format a rule's fractional
+    part makes it a fraction rule set.
+
+    Which rule is used to format a number is defined according to one of the
+    following algorithms:
+
+    REGULAR (NON-FRACTION) PROCESSING
+    ---------------------------------
+    If the rule set is a regular rule set, do the following:
+    
+    MASTER_RULE
+    If the rule set includes a master rule (and the number was passed in as a
+    double), use the master rule.  (If the number being formatted was passed
+    in as a long, the master rule is ignored.)
+    
+    NEGATIVE_NUMBER_RULE
+    If the number is negative, use the negative-number rule.
+    
+    IMPROPER_FRACTION_RULE
+    If the number has a fractional part and is greater than 1, use
+    the improper fraction rule.
+    
+    PROPER_FRACTION_RULE
+    If the number has a fractional part and is between 0 and 1, use
+    the proper fraction rule.
+
+    Binary-search the rule list for the rule with the highest base value
+    less than or equal to the number. If that rule has two substitutions,
+    its base value is not an even multiple of its divisor, and the number
+    is an even multiple of the rule's divisor, use the rule that precedes
+    it in the rule list. Otherwise, use the rule itself.
+    
+    FRACTION PROCESSING
+    -------------------
+    If the rule set is a fraction rule set, do the following:
+
+    Ignore negative-number and fraction rules.
+    
+    For each rule in the list, multiply the number being formatted (which
+    will always be between 0 and 1) by the rule's base value. Keep track
+    of the distance between the result and the nearest integer.
+    
+    Use the rule that produced the result closest to zero in the above
+    calculation. In the event of a tie or a direct hit, use the first
+    matching rule encountered. (The idea here is to try each rule's base
+    value as a possible denominator of a fraction. Whichever denominator
+    produces the fraction closest in value to the number being formatted
+    wins.)
+
+    If the rule following the matching rule has the same base value,
+    use it if the numerator of the fraction is anything other than 1; if
+    the numerator is 1, use the original matching rule. (This is to allow
+    singular and plural forms of the rule text without a lot of extra hassle.)
+
+    ----
+
+    A rule's body consists of a string of characters terminated by a semicolon.
+    The rule may include zero, one, or two substitution tokens, and a range of
+    text in brackets. The brackets denote optional text (and may also include
+    one or both substitutions). The exact meanings of the substitution tokens,
+    and under what conditions optional text is omitted, depend on the syntax
+    of the substitution token and the context. The rest of the text in a rule
+    body is literal text that is output when the rule matches the number
+    being formatted.
+
+    A substitution token begins and ends with a token character. The token
+    character and the context together specify a mathematical operation to
+    be performed on the number being formatted. An optional substitution
+    descriptor specifies how the value resulting from that operation is
+    used to fill in the substitution. The position of the substitution
+    token in the rule body specifies the location of the resultant text
+    in the original rule text.
+
+    The meanings of the substitution token characters are as follows:
+    
+    →→  REMAINDER_TOKEN
+        :in normal rule:
+            Divide the number by the rule's divisor and format the remainder
+        :in negative-number rule:
+            Find the absolute value of the number and format the result
+        :in fraction or master rule:
+            Isolate the number's fractional part and format it.
+        :in rule in fraction rule set:
+            Not allowed.
+    
+    →→→  PREVIOUS_TOKEN
+        :in normal rule:
+            Divide the number by the rule's divisor and format the
+            remainder, but bypass the normal rule-selection process
+            and just use the rule that precedes this one in this
+            rule list.
+        :in all other rules:
+            Not allowed.
+    
+    ←←  INTEGRAL_TOKEN
+        :in normal rule:
+            Divide the number by the rule's divisor and format the quotient
+        :in negative-number rule:
+            Not allowed.
+        :in fraction or master rule:
+            Isolate the number's integral part and format it.
+        :in rule in fraction rule set:
+            Multiply the number by the rule's base value and format the result.
+    
+    ==  SUBSTITUTION_TOKEN
+        :in all rule sets:
+            Format the number unchanged
+    
+    []  OPT_START, OPT_END
+        :in normal rule:
+            Omit the optional text if the number is an even
+            multiple of the rule's divisor
+        :in negative-number rule:
+            Not allowed.
+        :in improper-fraction rule:
+            Omit the optional text if the number is between 0 and 1
+            (same as specifying both an x.x rule and a 0.x rule)
+        :in master rule:
+            Omit the optional text if the number is an integer
+            (same as specifying both an x.x rule and an x.0 rule)
+            !!! contradicts the above as it says the master rule is ignored
+        :in proper-fraction rule:
+            Not allowed.
+        :in rule in fraction rule set:
+            Omit the optional text if multiplying the number by the
+            rule's base value yields 1.
+    
+    $(cardinal,plural syntax)$  PLURAL_TOKEN
+        :in all rule sets:
+            This provides the ability to choose a word based on the
+            number divided by the radix to the power of the exponent
+            of the base value for the specified locale, which is
+            normally equivalent to the ←← value. This uses the cardinal
+            plural rules from PluralFormat. All strings used in the
+            plural format are treated as the same base value for parsing.
+    
+    $(ordinal,plural syntax)$  PLURAL_TOKEN
+        :in all rule sets:
+            This provides the ability to choose a word based on the
+            number divided by the radix to the power of the exponent
+            of the base value for the specified locale, which is
+            normally equivalent to the ←← value. This uses the ordinal
+            plural rules from PluralFormat. All strings used in the
+            plural format are treated as the same base value for parsing.
+    
+    INFINITY_RULE = 'Inf'
+    
+    NOT_A_NUMBER_RULE = 'NaN'
+    
+    SPECIAL_FRACTION_RULE = 'x,x'  # there are other options but not existent in CLDR
+    """
+    def __init__(self, name, private=False):
+        self.name = name
+        self.private = private
+        self.rules = []
+
+
+    def apply(self, number, parent, fractional=False):
+        number = decimal.Decimal(str(number))
+        # str is needed to avoid unnecessary precision
+        # decimal is necessary for exact representation in fraction rules
+
+        context = {
+            'search_at': parent,
+            'ruleset': self,
+            'fractional': fractional,
+            'omit_optional': False,  # NOTE: despite the name, Rule.apply keeps optional text only when this is true
+            SUBSTITUTION_TOKEN: number,
+            'remainder_as_fractional': False  # format remainder as  fractional rule?
+        }
+        integral, remainder = divmod(number, 1)
+
+        # fractional rule (ruleset in fractional processing)
+        # the value should always be between 0 and 1
+        # not yet tested; needs clarification
+        if fractional:
+            index = self.get_rule_fractional(remainder)
+            if index is None:
+                raise RuleNotFound("rule for fractional processing of %s" % remainder)
+            rule = self.rules[index]
+            context[INTEGRAL_TOKEN] = rule.value * remainder  # here remainder == number
+            context['omit_optional'] = rule.value * number == 1
+            return rule.apply(number, context)
+
+        # negative number rule
+        if number < 0:
+            rule =  self.get_rule_special(NEGATIVE_NUMBER_RULE)
+            if rule is None:
+                raise RuleNotFound("negative number rule (%s)" % NEGATIVE_NUMBER_RULE)
+            context[REMAINDER_TOKEN] = abs(number)
+            return rule.apply(number, context)
+
+        # master and fraction rules
+        if remainder != 0:
+            context[REMAINDER_TOKEN] = number - integral
+            context[INTEGRAL_TOKEN] = integral
+            context['remainder_as_fractional'] = True
+
+            # search for master rule
+            rule = self.get_rule_special(MASTER_RULE, strict=True)
+
+            # no master rule found
+            if rule is None:
+                if integral == 0:
+                    rule = self.get_rule_special(PROPER_FRACTION_RULE)
+                    if rule is None:
+                        raise RuleNotFound("proper fraction rule (%s)" % PROPER_FRACTION_RULE)
+
+                else:
+                    rule = self.get_rule_special(IMPROPER_FRACTION_RULE)
+                    if rule is None:
+                        raise RuleNotFound("improper fraction rule (%s)" % IMPROPER_FRACTION_RULE)
+                    context['omit_optional'] = 0 < number < 1  # NOTE(review): always False here, since integral != 0 in this branch
+
+            return rule.apply(number, context)
+
+        # normal rule
+        index = self.get_rule_integral(integral)
+        if index is None:
+            raise RuleNotFound("normal rule for %s" % integral)
+        rule = self.rules[index]
+        i, r = divmod(integral, rule.divisor)
+        context[REMAINDER_TOKEN] = r
+        context[INTEGRAL_TOKEN] = i
+        context[PREVIOUS_TOKEN] = index-1  # get rule using ruleset
+        context['omit_optional'] = r != 0  # only if not even multiple (TODO no need to store separately)
+        return rule.apply(number, context)
+
+
+    def get_rule_special(self, val, strict=False):
+        if val in Rule.specials:
+            for r in self.rules:
+                if r.value == val:
+                    return r
+        
+        # return last rule if no match occurred and strict is false
+        if not strict:
+            return self.rules[-1]
+
+
+    def get_rule_integral(self, val):
+        """
+        Return the index of the rule with the highest base value
+        less than or equal to the number (found by a linear scan).
+
+        If that rule has two substitutions,
+        its base value is not an even multiple of its divisor, and the number
+        is an even multiple of the rule's divisor, use the rule that precedes
+        it in the rule list. Otherwise, use the rule itself.
+        """
+        # automatically return last rule if no range matched
+        ret = len(self.rules)-1
+
+        for i in range(len(self.rules)-1):
+            if self.rules[i].value in Rule.specials:
+                continue
+
+            if self.rules[i].value <= val < self.rules[i+1].value:
+                ret = i
+                break
+
+        # roll back only when the base value is NOT an even multiple of the divisor
+        rule = self.rules[ret]
+        if rule.substitutions == 2 and \
+                rule.value % rule.divisor != 0 and \
+                val % rule.divisor == 0:
+            ret -= 1
+
+        return ret
+
+
+    def get_rule_fractional(self, val):
+        """If the rule set is a fraction rule set, do the following:
+
+        Ignore negative-number and fraction rules.
+
+        For each rule in the list, multiply the number being formatted (which
+        will always be between 0 and 1) by the rule's base value. Keep track
+        of the distance between the result and the nearest integer.
+
+        Use the rule that produced the result closest to zero in the above
+        calculation. In the event of a tie or a direct hit, use the first
+        matching rule encountered. (The idea here is to try each rule's base
+        value as a possible denominator of a fraction. Whichever denominator
+        produces the fraction closest in value to the number being formatted
+        wins.)
+
+        If the rule following the matching rule has the same base value,
+        use it if the numerator of the fraction is anything other than 1; if
+        the numerator is 1, use the original matching rule. (This is to allow
+        singular and plural forms of the rule text without a lot of extra hassle.)
+
+        ??? what is considered the numerator of what fraction here
+        ??? is it rather not the closest integer
+        """
+        dists = []
+        for i, rule in enumerate(self.rules):
+            if rule.value in Rule.specials or rule.value == 0:  # ignore specials and 0 rules
+                continue
+            d = abs(round(val*rule.value) - val*rule.value)
+            dists.append((i, d))
+        if not dists:
+            return None  # no usable rule; Ruleset.apply turns None into RuleNotFound
+        bst = min(dists, key=lambda x: x[1])[0]  # index of the match closest to 0
+
+        # there is a following rule
+        if len(self.rules) > bst+1 and \
+                self.rules[bst].value == self.rules[bst+1].value and \
+                val*self.rules[bst].value > 1:
+            bst += 1
+
+        return bst
+
+
+    def __repr__(self):
+        return 'Ruleset %s %s\n%s\n' % (self.name, self.private, '\n'.join(['\t'+str(r) for r in self.rules]))
+
+
+class Rule(object):
+    """
+    base value, a divisor, rule text, and zero, one, or two substitutions.
+    """
+    specials = (
+        NEGATIVE_NUMBER_RULE, IMPROPER_FRACTION_RULE,
+        PROPER_FRACTION_RULE, MASTER_RULE, INFINITY_RULE,
+        NOT_A_NUMBER_RULE, SPECIAL_FRACTION_RULE,
+    )
+
+
+    def __init__(self, value, text, radix=None):
+        """Store the base value, text and radix of a rule, then tokenize the text.
+        NOTE: an unparsable value only triggers a warning; ``self.value`` stays unset.
+        """
+        if value in self.specials:
+            self.value = value
+        else:
+            try:
+                self.value = int(value)
+            except (TypeError, ValueError):
+                warnings.warn("Unknown rule value: [%s]" % value, SyntaxWarning)
+
+        self.text = text
+        self._radix = radix
+
+        self._parse(text)
+
+
+    def apply(self, number, context):
+        """Build the text of this rule by expanding its tokens with the values in `context`.
+        """
+        from .numbers import format_decimal
+        res = []
+        for t in self.tokens:
+            if t.optional and not context['omit_optional']:
+                continue
+
+            if t.type == TEXT_TOKEN:
+                res.append(t.reference)
+
+            elif t.type in REFERENCE_TOKENS:
+                ref_type, ref = t.reference
+                ruleset = None
+                if ref_type == INTERNAL_REF:
+                    ruleset = context['ruleset']
+                elif ref_type in (PUBLIC_REF, PRIVATE_REF):  # currently no distinction
+                    ruleset = context['search_at'].get_ruleset(ref)
+                elif ref_type == DECIMAL_REF:
+                    loc = context['search_at']._locale
+                    x = format_decimal(number, format=ref, locale=loc)
+                    res.append(x)
+
+                if ruleset:
+                    if t.type == REMAINDER_TOKEN and context['remainder_as_fractional']:
+                        fractional = True
+                    else:
+                        fractional = context['fractional']
+                    res.append(ruleset.apply(
+                        context[t.type],  # number
+                        context['search_at'],  # parent
+                        fractional,
+                    ))
+
+            elif t.type == PREVIOUS_TOKEN:
+                rule = context['ruleset'].rules[context[PREVIOUS_TOKEN]]
+                res.append(rule.apply(
+                    context[REMAINDER_TOKEN],  # number
+                    context,  # ???
+                ))
+
+            else:
+                raise ValueError('unknown token %s' % (t,))  # t is a tuple, so wrap it
+
+
+        return ''.join(res)
+
+
+    @property
+    def divisor(self):
+        """Highest power of the radix less than or equal to the rule's base value (None for special rules)."""
+        if isinstance(self.value, int):
+            radix, power = int(self.radix), 1
+            while radix > 1 and power * radix <= self.value:
+                power *= radix  # exact integer arithmetic, no logarithm rounding
+            return power
+
+    
+    @property
+    def radix(self):
+        return int(self._radix) if self._radix else 10  # radix may arrive as str; default is 10
+
+
+    @property
+    def substitutions(self):
+        return len([t for t in self.tokens if t.type in REFERENCE_TOKENS])
+
+
+    def _parse(self, text):
+        try:
+            self.tokens = [t for t in tokenize(text)]
+        except ValueError:
+            raise TokenizationError(self.text)
+
+
+    def __repr__(self):
+        return 'Rule %s (%s) - %s\n%s\n' % (
+            self.value, self.text,
+            self.radix,
+            '\n'.join(['\t\t'+str(t) for t in self.tokens]))
diff --git a/scripts/import_cldr.py b/scripts/import_cldr.py
index 4188055a6..dcd607ed5 100755
--- a/scripts/import_cldr.py
+++ b/scripts/import_cldr.py
@@ -32,6 +32,7 @@
 sys.path.insert(0, CHECKOUT_ROOT)
 
 from babel import dates, numbers
+from babel import rbnf
 from babel._compat import pickle, text_type
 from babel.dates import split_interval_pattern
 from babel.localedata import Alias
@@ -222,6 +223,7 @@ def parse_global(srcdir, sup):
     all_currencies = collections.defaultdict(set)
     currency_fractions = global_data.setdefault('currency_fractions', {})
     territory_languages = global_data.setdefault('territory_languages', {})
+    rbnf_locales = global_data.setdefault('rbnf_locales', [])
     bcp47_timezone = parse(os.path.join(srcdir, 'bcp47', 'timezone.xml'))
     sup_windows_zones = parse(os.path.join(sup_dir, 'windowsZones.xml'))
     sup_metadata = parse(os.path.join(sup_dir, 'supplementalMetadata.xml'))
@@ -325,6 +327,14 @@ def parse_global(srcdir, sup):
                 'official_status': language.attrib.get('officialStatus'),
             }
         territory_languages[territory.attrib['type']] = languages
+
+    # To help the negotiation in `babel.numbers.spell_number`
+    # add all locales with rbnf rules to a list under `rbnf_locales`
+    filenames = os.listdir(os.path.join(srcdir, 'rbnf'))
+    filenames.remove('root.xml')
+    # TODO parse root.xml for global data (how to fall back?)
+    global_data['rbnf_locales'] = [os.path.splitext(f)[0] for f in filenames]
+
     return global_data
 
 
@@ -430,6 +440,13 @@ def _process_local_datas(sup, srcdir, destdir, force=False, dump_json=False):
         parse_character_order(data, tree)
         parse_measurement_systems(data, tree)
 
+        # there will be no rbnf rules for all locales
+        # there could be a separate iteration for rbnf rule files
+        rbnf_filename = os.path.join(srcdir, 'rbnf', filename)
+        if os.path.isfile(rbnf_filename):
+            rbnf_tree = parse(rbnf_filename)
+            parse_rbnf_rules(data, rbnf_tree)
+
         write_datafile(data_filename, data, dump_json=dump_json)
 
 
@@ -961,6 +978,36 @@ def parse_measurement_systems(data, tree):
             _import_type_text(measurement_systems, measurement_system, type=type)
 
 
+def parse_rbnf_rules(data, tree):
+    """
+    Collect the RBNF rulesets of `tree` into data['rbnf_rules'], keyed by group name; see
+    http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting
+    """
+    rbnf_rules = data.setdefault('rbnf_rules', {})
+
+    # ElementTree.dump(tree)
+
+    for ruleset_grouping in tree.findall('.//rbnf/rulesetGrouping'):
+        group_name = ruleset_grouping.attrib['type']
+        rbnf_rules[group_name] = []  # TODO check for overwrite
+        for ruleset in ruleset_grouping.findall('ruleset'):
+            ruleset_name = ruleset.attrib['type']
+            private = ruleset.attrib.get('access') == 'private'
+            ruleset_obj = rbnf.Ruleset(ruleset_name, private)
+            for rule in ruleset.findall('rbnfrule'):
+                radix = rule.attrib.get('radix')
+                try:
+                    rule_obj = rbnf.Rule(rule.attrib['value'], rule.text, radix)
+                    ruleset_obj.rules.append(rule_obj)  # skipped when tokenization fails
+                except rbnf.TokenizationError:
+                    log('%s: Unable to parse rule "%s%s: %s "' % (
+                        data['locale_id'],
+                        rule.attrib['value'],
+                        '' if radix is None else ('/%s' % radix),
+                        rule.text,
+                    ))
+            rbnf_rules[group_name].append(ruleset_obj)
+
 
 if __name__ == '__main__':
     main()
diff --git a/tests/test_number_spelling.py b/tests/test_number_spelling.py
new file mode 100644
index 000000000..d92073c3a
--- /dev/null
+++ b/tests/test_number_spelling.py
@@ -0,0 +1,187 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+import unittest
+import pytest
+
+from babel import numbers
+from babel import rbnf
+from babel.core import get_global
+from babel.localedata import locale_identifiers
+
+soft_hyphen = '\xad'
+
+class TestRuleEngine(unittest.TestCase):
+    """
+    Test everything related to the rules engine: locale negotiation,
+    the ruleset groups stored in the locale data and the rule
+    tokenizer.
+    """
+    def test_basic(self):
+        # hu_HU is expected to negotiate to the plain "hu" locale
+        speller = rbnf.RuleBasedNumberFormat.negotiate('hu_HU')
+        assert str(speller._locale) == 'hu'
+        assert 'spellout-numbering' in speller.available_rulesets
+
+    def test_negotiation(self):
+        # the only ruleset groups the locale data is expected to contain
+        valid_ruleset_groups = ("SpelloutRules", "OrdinalRules", "NumberingSystemRules")
+
+        for lid in locale_identifiers():
+            loc = rbnf.RuleBasedNumberFormat.negotiate(lid)._locale
+            if loc is None:
+                # nothing to verify; generate warning if necessary
+                continue
+            # test groups
+            for group in loc._data['rbnf_rules']:
+                assert group in valid_ruleset_groups
+
+    def test_tokenization(self):
+        # a plain text token followed by an optional (bracketed) one
+        expected = [
+            rbnf.TokenInfo(type=1, reference='text', optional=False),
+            rbnf.TokenInfo(type=1, reference='opt', optional=True),
+        ]
+        tokens = list(rbnf.tokenize("text[opt];"))
+        assert tokens == expected
+
+    def test_xml_parsing(self):
+        """
+        all the rules should be able to go through the parser and tokenizer
+        (placeholder: make up some rules and run the tokenizer on them)
+
+        TODO
+        read data from all the locales that have rbnf_rules defined
+        all the raw rules should be in a specific structure based
+        on the XML specification
+        """
+        assert True
+
+
+class TestSpelling(unittest.TestCase):
+    """
+    Locale specific tests
+    """
+    def _assert_spellings(self, cases, **kwargs):
+        # ``cases`` holds (number, expected text) pairs, ``kwargs`` go to
+        # ``spell_number``; soft hyphens are dropped before comparing
+        for number, expected in cases:
+            spelled = numbers.spell_number(number, **kwargs)
+            assert spelled.replace(soft_hyphen, '') == expected
+
+    def test_hu_HU_cardinal(self):
+        self._assert_spellings([
+            (0, "nulla"),
+            (1, "egy"),
+            (2, "kettő"),
+            (3, "három"),
+            (10, "tíz"),
+            (20, "húsz"),
+            # ('-0', "mínusz nulla"),
+            # (123.25, "százhuszonhárom egész huszonöt század"),
+            (-12, "mínusz tizenkettő"),
+            # (23457829, "huszonhárommillió-négyszázötvenhétezer-nyolcszázhuszonkilenc"),
+            (1950, "ezerkilencszázötven"),
+            # only soft hyphens in the rules !!!
+            # (2001, "kétezer-egy"),
+            # ('1999.2386', "ezerkilencszázkilencvenkilenc egész kétezer-háromszáznyolcvanhat tízezred"),
+            # (-.199923862, "mínusz nulla egész százkilencvenkilencezer-kilencszázhuszonnégy milliomod"),
+            # (-.199923862, "kerekítve mínusz nulla egész ezerkilencszázkilencvenkilenc tízezred"),
+            # (.4326752, "nulla egész negyvenhárom század"),
+        ], locale='hu_HU')
+
+    def test_hu_HU_ordinal(self):
+        self._assert_spellings([
+            (0, "nulla"),
+            # (0, "nulladik"),
+            (1, "első"),
+            (2, "második"),
+            (3, "harmadik"),
+            (10, "tizedik"),
+            (20, "huszadik"),
+            (30, "harmincadik"),
+            (-12, "mínusz tizenkettedik"),
+            # (23457829, "huszonhárommilliónégyszázötvenhétezernyolcszázhuszonkilencedik"),  # wrong, multiple cldr errors
+            # (23457829, "huszonhárommillió-négyszázötvenhétezer-nyolcszázhuszonkilencedik"),
+            (1100, "ezerszázadik"),
+            (1950, "ezerkilencszázötvenedik"),
+            # (2001, "kétezer-egyedik"),
+        ], locale='hu_HU', ordinal=True)
+
+    def test_en_GB_cardinal(self):
+        self._assert_spellings([
+            (0, "zero"),
+            (1, "one"),
+            (2, "two"),
+            (3, "three"),
+            # ('-0', "minus zero"),
+            # (123.25, "one hundred and twenty-three point twenty-five hundredths"),
+            (-12, "minus twelve"),
+            (23457829, "twenty-three million four hundred fifty-seven thousand eight hundred twenty-nine"),
+            # (23457829, "twenty-three million four hundred and fifty-seven thousand eight hundred and twenty-nine"),
+            (1950, "one thousand nine hundred fifty"),
+            # (1950, "one thousand nine hundred and fifty"),
+            (2001, "two thousand one"),
+            # ('1999.238', "one thousand nine hundred and ninety-nine point two hundred and thirty-eight thousandths"),
+            # (-.199923862, "approximately minus zero point two tenths"),  # with precision=3, state_rounded=True
+            # (-.1, "minus zero point one tenth"),  # float to string conversion preserves precision
+        ], locale='en_GB')
+
+    def test_en_GB_ordinal(self):
+        self._assert_spellings([
+            (0, "zeroth"),
+            (1, "first"),
+            (2, "second"),
+            (3, "third"),
+            (4, "fourth"),
+            (5, "fifth"),
+            (6, "sixth"),
+            (7, "seventh"),
+            (8, "eighth"),
+            (9, "ninth"),
+            (10, "tenth"),
+            (11, "eleventh"),
+            (12, "twelfth"),
+            (13, "thirteenth"),
+            (20, "twentieth"),
+            (30, "thirtieth"),
+            (40, "fortieth"),
+            # (40, "fourtieth"),
+            (-12, "minus twelfth"),
+            # (23457829, "twenty-three million four hundred fifty-seven thousand eight hundred twenty-ninth"),  # apostrophes
+            # (23457829, "twenty-three million four hundred and fifty-seven thousand eight hundred and twenty-ninth"),
+            (1950, "one thousand nine hundred fiftieth"),
+            # (1950, "one thousand nine hundred and fiftieth"),
+            (2001, "two thousand first"),
+        ], locale='en_GB', ordinal=True)
+
+
+
+# def test_hu_HU_error():
+#     with pytest.raises(exceptions.TooBigToSpell) as excinfo:
+#         _spell(10**66, ordinal=True)
+
+#     with pytest.raises(exceptions.PrecisionError) as excinfo:
+#         _spell(.4326752, locale='hu_HU', precision=7)
+
+#     with pytest.raises(exceptions.PrecisionError) as excinfo:
+#         _spell(.4326752)
+
+#     with pytest.raises(exceptions.NoFractionOrdinalsAllowed) as excinfo:
+#         _spell('1999.23862', ordinal=True)
+
+# def test_en_GB_error():
+#     with pytest.raises(exceptions.TooBigToSpell) as excinfo:
+#         _spell(10**24, ordinal=True, locale='en_GB')
+
+#     with pytest.raises(exceptions.PrecisionError) as excinfo:
+#         _spell(.4326752, locale='en_GB', precision=4)
+
+#     with pytest.raises(exceptions.PrecisionError) as excinfo:
+#         _spell(.4326752, locale='en_GB')
+
+#     with pytest.raises(exceptions.NoFractionOrdinalsAllowed) as excinfo:
+#         _spell('1999.23', ordinal=True, locale='en_GB')
+
+