From fa304a251a31c5f0efe20dee7997f521eb0b9e39 Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Tue, 13 May 2025 10:54:38 -0700 Subject: [PATCH 01/29] First draft of Korean Cardinal ITN Sparrowhawk testing is not done yet. Signed-off-by: hmlee245 --- .../inverse_normalize.py | 7 +- .../inverse_text_normalization/ko/__init__.py | 17 + .../ko/clean_eval_data.py | 361 ++++++++++++++++++ .../ko/data/__init__.py | 13 + .../ko/data/numbers/__init__.py | 13 + .../ko/data/numbers/digit.tsv | 9 + .../ko/data/numbers/thousands.tsv | 11 + .../ko/data/numbers/zero.tsv | 1 + .../ko/graph_utils.py | 292 ++++++++++++++ .../ko/taggers/__init__.py | 17 + .../ko/taggers/cardinal.py | 104 +++++ .../ko/taggers/tokenize_and_classify.py | 76 ++++ .../ko/taggers/word.py | 32 ++ .../inverse_text_normalization/ko/utils.py | 23 ++ .../ko/verbalizers/__init__.py | 17 + .../ko/verbalizers/cardinal.py | 54 +++ .../ko/verbalizers/verbalize.py | 36 ++ .../ko/verbalizers/verbalize_final.py | 49 +++ .../ko/verbalizers/word.py | 34 ++ .../run_evaluate.py | 2 +- tests/nemo_text_processing/ko/__init__.py | 13 + .../test_cases_cardinal.txt | 27 ++ .../nemo_text_processing/ko/test_cardinal.py | 39 ++ ..._sparrowhawk_inverse_text_normalization.sh | 34 ++ .../pynini_export.py | 8 + 25 files changed, 1287 insertions(+), 2 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/ko/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv create mode 100644 
nemo_text_processing/inverse_text_normalization/ko/graph_utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/word.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py create mode 100644 tests/nemo_text_processing/ko/__init__.py create mode 100644 tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/ko/test_cardinal.py create mode 100644 tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index c10819908..e505a8ad0 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -131,6 +131,11 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ja.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) + elif lang == 'ko': # Korean + from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst + from 
nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import ( + VerbalizeFinalFst, + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -175,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja','ko'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/__init__.py new file mode 100644 index 000000000..f541211af --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py new file mode 100644 index 000000000..3c1193333 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py @@ -0,0 +1,361 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import ArgumentParser +from typing import List + +import regex as re + +from nemo_text_processing.text_normalization.data_loader_utils import ( + EOS_TYPE, + Instance, + load_files, + training_data_to_sentences, +) + +""" +This file is for evaluation purposes. +filter_loaded_data() cleans data (list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually. +For example, normalized text should only include characters and whitespace characters but no punctuation. + Cardinal unnormalized instances should contain at least one integer and all other characters are removed. 
+""" + + +class Filter: + """ + Filter class + + Args: + class_type: semiotic class used in dataset + process_func: function to transform text + filter_func: function to filter text + + """ + + def __init__(self, class_type: str, process_func: object, filter_func: object): + self.class_type = class_type + self.process_func = process_func + self.filter_func = filter_func + + def filter(self, instance: Instance) -> bool: + """ + filter function + + Args: + filters given instance with filter function + + Returns: True if given instance fulfills criteria or does not belong to class type + """ + if instance.token_type != self.class_type: + return True + return self.filter_func(instance) + + def process(self, instance: Instance) -> Instance: + """ + process function + + Args: + processes given instance with process function + + Returns: processed instance if instance belongs to expected class type or original instance + """ + if instance.token_type != self.class_type: + return instance + return self.process_func(instance) + + +def filter_cardinal_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_cardinal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r"[^0-9]", "", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_ordinal_1(instance: Instance) -> bool: + ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized) + return ok + + +def process_ordinal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r"[,\s]", "", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def 
filter_decimal_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_decimal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + un_normalized = re.sub(r",", "", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_measure_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_measure_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r",", "", un_normalized) + un_normalized = re.sub(r"m2", "m²", un_normalized) + un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized) + normalized = re.sub(r"[^a-z\s]", "", normalized) + normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_money_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_money_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r",", "", un_normalized) + un_normalized = re.sub(r"a\$", r"$", un_normalized) + un_normalized = re.sub(r"us\$", r"$", un_normalized) + un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized) + un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_time_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_time_1(instance: Instance) -> Instance: + un_normalized = 
instance.un_normalized + un_normalized = re.sub(r": ", ":", un_normalized) + un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized) + un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_plain_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_plain_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_punct_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_punct_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_date_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_date_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + un_normalized = re.sub(r",", "", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_letters_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_letters_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_verbatim_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_verbatim_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + 
normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_digit_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_digit_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_telephone_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_telephone_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_electronic_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_electronic_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_fraction_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_fraction_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_address_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_address_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = 
re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +filters = [] +filters.append(Filter(class_type="CARDINAL", + process_func=process_cardinal_1, filter_func=filter_cardinal_1)) +filters.append(Filter(class_type="ORDINAL", + process_func=process_ordinal_1, filter_func=filter_ordinal_1)) +filters.append(Filter(class_type="DECIMAL", + process_func=process_decimal_1, filter_func=filter_decimal_1)) +filters.append(Filter(class_type="MEASURE", + process_func=process_measure_1, filter_func=filter_measure_1)) +filters.append(Filter(class_type="MONEY", + process_func=process_money_1, filter_func=filter_money_1)) +filters.append(Filter(class_type="TIME", + process_func=process_time_1, filter_func=filter_time_1)) + +filters.append(Filter(class_type="DATE", + process_func=process_date_1, filter_func=filter_date_1)) +filters.append(Filter(class_type="PLAIN", + process_func=process_plain_1, filter_func=filter_plain_1)) +filters.append(Filter(class_type="PUNCT", + process_func=process_punct_1, filter_func=filter_punct_1)) +filters.append(Filter(class_type="LETTERS", + process_func=process_letters_1, filter_func=filter_letters_1)) +filters.append(Filter(class_type="VERBATIM", + process_func=process_verbatim_1, filter_func=filter_verbatim_1)) +filters.append(Filter(class_type="DIGIT", + process_func=process_digit_1, filter_func=filter_digit_1)) +filters.append(Filter(class_type="TELEPHONE", + process_func=process_telephone_1, filter_func=filter_telephone_1)) +filters.append(Filter(class_type="ELECTRONIC", + process_func=process_electronic_1, filter_func=filter_electronic_1)) +filters.append(Filter(class_type="FRACTION", + process_func=process_fraction_1, filter_func=filter_fraction_1)) +filters.append(Filter(class_type="ADDRESS", + process_func=process_address_1, filter_func=filter_address_1)) +filters.append(Filter(class_type=EOS_TYPE, + process_func=lambda x: x, filter_func=lambda x: True)) 
+ + +def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]: + """ + Filters list of instances + + Args: + data: list of instances + + Returns: filtered and transformed list of instances + """ + updates_instances = [] + for instance in data: + updated_instance = False + for fil in filters: + if fil.class_type == instance.token_type and fil.filter(instance): + instance = fil.process(instance) + updated_instance = True + if updated_instance: + if verbose: + print(instance) + updates_instances.append(instance) + return updates_instances + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument("--input", help="input file path", + type=str, default='./en_with_types/output-00001-of-00100') + parser.add_argument( + "--verbose", help="print filtered instances", action='store_true') + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + file_path = args.input + + print("Loading training data: " + file_path) + instance_list = load_files([file_path]) # List of instances + filtered_instance_list = filter_loaded_data(instance_list, args.verbose) + training_data_to_sentences(filtered_instance_list) diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv new file mode 100644 index 000000000..9871cb9cf --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv @@ -0,0 +1,9 @@ +일 1 +이 2 +삼 3 +사 4 +오 5 +육 6 +칠 7 +팔 8 +구 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv new file mode 100644 index 000000000..541752211 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv @@ -0,0 +1,11 @@ +억 +조 +경 +해 +자 +양 +구 +간 +정 +재 +극 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv new file mode 100644 index 000000000..43baac7c1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv @@ -0,0 +1 @@ +영 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py new file mode 100644 index 000000000..7a9fd8720 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py @@ -0,0 +1,292 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.examples import plurals +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels + +NEMO_CHAR = utf8.VALID_UTF8_CHAR + +NEMO_NARROW_NON_BREAK_SPACE = "\u202f" +NEMO_DIGIT = byte.DIGIT +NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() +NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() +NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() +NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_NON_BREAKING_SPACE = "\u00a0" +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize() +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() + +NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() +NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() + +NEMO_SIGMA = pynini.closure(NEMO_CHAR) + +NEMO_NOT_ALPHA = pynini.difference(NEMO_SIGMA, NEMO_ALPHA).optimize() +NEMO_LOWER_NOT_A = pynini.union( + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", +).optimize() + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(" field_order: \"") + 
NEMO_NOT_QUOTE + pynutil.delete("\"")) +) + +suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) +# _v = pynini.union("a", "e", "i", "o", "u") +_c = pynini.union( + "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z" +) +_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") +_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") +_s = NEMO_SIGMA + pynutil.insert("s") + +graph_plural = plurals._priority_union( + suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA +).optimize() + +SINGULAR_TO_PLURAL = graph_plural +PLURAL_TO_SINGULAR = pynini.invert(graph_plural) +TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]) +TO_UPPER = pynini.invert(TO_LOWER) +MIN_NEG_WEIGHT = -0.0001 +MIN_POS_WEIGHT = 0.0001 +INPUT_CASED = "cased" +INPUT_LOWER_CASED = "lower_cased" +MINUS = pynini.union("minus", "Minus").optimize() + + +def capitalized_input_graph( + graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None +) -> 'pynini.FstLike': + """ + Allow graph input to be capitalized, e.g. for ITN) + + Args: + graph: FstGraph + original_graph_weight: weight to add to the original `graph` + capitalized_graph_weight: weight to add to the capitalized graph + """ + capitalized_graph = pynini.compose(TO_LOWER + NEMO_SIGMA, graph).optimize() + + if original_graph_weight is not None: + graph = pynutil.add_weight(graph, weight=original_graph_weight) + + if capitalized_graph_weight is not None: + capitalized_graph = pynutil.add_weight(capitalized_graph, weight=capitalized_graph_weight) + + graph |= capitalized_graph + return graph + + +def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): + """ + Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. 
+ + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logging.info(f'Created {file_name}') + + +def get_plurals(fst): + """ + Given singular returns plurals + + Args: + fst: Fst + + Returns plurals to given singular forms + """ + return SINGULAR_TO_PLURAL @ fst + + +def get_singulars(fst): + """ + Given plural returns singulars + + Args: + fst: Fst + + Returns singulars to given plural forms + """ + return PLURAL_TO_SINGULAR @ fst + + +def convert_space(fst) -> 'pynini.FstLike': + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. + + Args: + fst: input fst + + Returns output fst where breaking spaces are converted to non breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + + +def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): + labels = load_labels(input_file) + + if input_case == INPUT_CASED: + additional_labels = [] + for written, spoken, *weight in labels: + written_capitalized = written[0].upper() + written[1:] + additional_labels.extend( + [ + [written_capitalized, spoken.capitalize()], # first letter capitalized + [ + written_capitalized, + spoken.upper().replace(" AND ", " and "), + ], # # add pairs with the all letters capitalized + ] + ) + + spoken_no_space = spoken.replace(" ", "") + # add abbreviations without spaces (both lower and upper case), i.e. 
"BMW" not "B M W" + if len(spoken) == (2 * len(spoken_no_space) - 1): + logging.debug(f"This is weight {weight}") + if len(weight) == 0: + additional_labels.extend( + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]] + ) + else: + additional_labels.extend( + [ + [written, spoken_no_space, weight[0]], + [written_capitalized, spoken_no_space.upper(), weight[0]], + ] + ) + labels += additional_labels + + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + +class GraphFst: + """ + Base class for all grammar fsts. + + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far') + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> 'pynini.FstLike': + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> 'pynini.FstLike': + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> 'pynini.FstLike': + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ 
pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py new file mode 100644 index 000000000..f541211af --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py new file mode 100644 index 000000000..df5804fc0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -0,0 +1,104 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + +class CardinalFst(GraphFst): + """ + Finite state transducer for classifying cardinals + e.g. 마이너스 이십삼 -> cardinal { integer: "23" negative: "-" } } + + Args: + input_case: accepting Korean input. + """ + + def __init__(self): + super().__init__(name="cardinal", kind="classify") + + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + graph_zero = pynini.cross("영", "0") + + graph_negative = pynini.cross("마이너스", "-") + graph_negative += delete_space + + ten = pynutil.delete("십") + ten_alt = pynini.cross("십", "1") + ### Responsible for second digit of two digit number. ex) 20's 2 + graph_ten_component = pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0")) + ### Responsible for the first digit of number. 
ex) 1,2,3,4,5,,, + graph_ten_component += graph_digit | pynutil.insert("0") + + hundred = pynutil.delete("백") + hundred_alt = pynini.cross("백", "1") + graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0")) + graph_hundred_component += graph_ten_component + + thousand = pynutil.delete("천") + thousand_alt = pynini.cross("천", "1") + graph_thousand_component = pynini.union(((graph_digit + thousand) | thousand_alt), pynutil.insert("0")) + graph_thousand_component += graph_hundred_component + + tenthousand = pynutil.delete("만") + tenthousand_alt = pynini.cross("만", "1") + ### "만" can express next four digits of numbers until the next unit "억", so insert "0000" to allocate four digit worth of space + ### From "만", keep adding four digits and graph_thousand_component(0000-9999), because Korean units increase every four digits + graph_tenthousand_component = pynini.union(((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000")) + graph_tenthousand_component += graph_thousand_component + + hundredmillion = pynutil.delete("억") + hundredmillion_alt = pynini.cross("억", "1") + graph_hundredmillion_component = pynini.union(((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000")) + graph_hundredmillion_component += graph_tenthousand_component + + trillion = pynutil.delete("조") + trillion_alt = pynini.cross("조", "1") + graph_trillion_component = pynini.union(((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000")) + graph_trillion_component += graph_hundredmillion_component + + tenquadrillion = pynutil.delete("경") + tenquadrillion_alt = pynini.cross("경", "1") + graph_tenquadrillion_component = pynini.union(((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000")) + graph_tenquadrillion_component += graph_trillion_component + + + graph = pynini.union( + ### From biggest unit to smallest, everything is included + 
graph_tenquadrillion_component| + graph_zero + ) + + leading_zero = ( + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) + ) + graph_nonzero = graph @ leading_zero + graph = pynini.union(graph_nonzero, graph_zero) + + graph = graph @ leading_zero | graph_zero + + self.just_cardinals = graph + + optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) + + final_graph = ( + optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") + ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..760ce6829 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -0,0 +1,76 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + INPUT_LOWER_CASED, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) + + +class ClassifyFst(GraphFst): + """ + Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + + Args: + input_case: accepting either "lower_cased" or "cased" input. + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files + whitelist: path to a file with whitelist replacements + """ + + def __init__( + self, + input_case: str = INPUT_LOWER_CASED, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + ): + super().__init__(name="tokenize_and_classify", kind="classify") + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"jp_itn_{input_case}.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] + logging.info(f"ClassifyFst.fst was restored from {far_file}.") + else: + logging.info(f"Creating ClassifyFst grammars.") + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + word_graph = WordFst().fst + classify = (pynutil.add_weight(cardinal_graph, 1.1)| pynutil.add_weight(word_graph, 100)) + + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") + tagger = pynini.closure(token, 1) + + 
self.fst = tagger + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) + logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py new file mode 100644 index 000000000..0d6ccd5c5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_SPACE, GraphFst + + +class WordFst(GraphFst): + """ + Finite state transducer for classifying plain tokens, that do not belong to any special class. This can be considered as the default class. + e.g. 
sleep -> tokens { name: "sleep" } + """ + + def __init__(self): + super().__init__(name="word", kind="classify") + word = pynutil.insert( + "name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/utils.py b/nemo_text_processing/inverse_text_normalization/ko/utils.py new file mode 100644 index 000000000..0222cc0b8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/utils.py @@ -0,0 +1,23 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + + +def get_abs_path(rel_path): + + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py new file mode 100644 index 000000000..da950f35e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py new file mode 100644 index 000000000..1800a6dc8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing cardinal + e.g. 
cardinal { negative: "-" integer: "23" } -> -23 + """ + + def __init__(self): + super().__init__(name="cardinal", kind="verbalize") + negative_sign = ( + pynutil.delete("negative:") + + delete_space + + pynutil.delete("\"") + + pynini.accep("-") + + pynutil.delete("\"") + ) + + optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1) + + digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) + integer_cardinal = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + digits_from_tag + + pynutil.delete("\"") + ) + + graph = integer_cardinal + final_graph = optional_sign_output + graph + self.fst = self.delete_tokens(final_graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py new file mode 100644 index 000000000..9d750d757 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + """ + + def __init__(self): + super().__init__(name="verbalize", kind="verbalize") + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + word_graph = WordFst().fst + + graph = (cardinal_graph|word_graph) + self.fst = graph + diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py new file mode 100644 index 000000000..8554fc161 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence, e.g. + tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now + """ + def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): + super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_deterministic_verbalizer.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["verbalize"] + else: + # token_graph = VerbalizeFst(deterministic=deterministic) + token_graph = VerbalizeFst().fst + token_verbalizer = ( + pynutil.delete("tokens {") + delete_space + token_graph + delete_space + pynutil.delete(" }") + ) + verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) + + self.fst = (verbalizer).optimize() + if far_file: + generator_main(far_file, {"verbalize": self.fst}) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py new file mode 100644 index 000000000..d79957ca8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + + +class WordFst(GraphFst): + ''' + tokens { name: "一" } -> 一 + ''' + + def __init__(self, deterministic: bool = True, lm: bool = False): + super().__init__(name="word", kind="verbalize", deterministic=deterministic) + + graph = pynutil.delete("name: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"") + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 0852329d6..7bfdd3399 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", 'ja'], + choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja","ko"], default="en", type=str, ) diff --git a/tests/nemo_text_processing/ko/__init__.py b/tests/nemo_text_processing/ko/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/tests/nemo_text_processing/ko/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..007273e5e --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,27 @@ +영~0 +구~9 +십~10 +십칠~17 +오십삼~53 +백~100 +백오~105 +삼백이십~320 +구백팔십칠~987 +천~1000 +천육~1006 +천오백~1500 +오천사백삼십이~5432 +만~10000 +만천이백~11200 +삼만오천칠백~35700 +십이만~120000 +백오십만삼천~1503000 +천만~10000000 +오천이백칠십만육천백~52706100 +억~100000000 +삼억오천만~350000000 +십이억천만~1210000000 +백오십억칠천만~15070000000 +오천억~500000000000 +일조~1000000000000 +이조오천억~2500000000000 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py new file mode 100644 index 000000000..9fd366ea6 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio + +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file + + +class TestCardinal: + inverse_normalizer_ko = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_ko.inverse_normalize(test_input, verbose=False) + assert pred == expected + + normalizer_with_audio_ko = ( + NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + if RUN_AUDIO_BASED_TESTS + else None + ) \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh new file mode 100644 index 000000000..c44f4a703 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh @@ -0,0 +1,34 @@ +#! 
/bin/sh + +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +TEST_DIR=${2:-"/workspace/tests/ko"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + # read test file + while read testcase; do + IFS='~' read spoken written <<< $testcase + denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + + # trim white space + written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$spoken" "$written" "$denorm_pred" + done < "$input" +} + +testITNCardinal() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $input +} + +# Remove all command-line arguments +shift $# + +# Load shUnit2 +. /workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 6b82dfbec..0df099774 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,6 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', + 'ko' ], type=str, default='en', @@ -307,6 +308,13 @@ def parse_args(): PostProcessingFst as TNPostProcessingFst, ) from nemo_text_processing.text_normalization.ja.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + elif args.language == 'ko': + from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ( + ClassifyFst as ITNClassifyFst, + ) + from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import ( + VerbalizeFst as ITNVerbalizeFst, + ) elif args.language == 'rw': from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, From 77da79d12b1378502cc2b382cd6933b02e7c2545 Mon Sep 17 00:00:00 2001 From: 
"pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 May 2025 18:46:22 +0000 Subject: [PATCH 02/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_normalize.py | 4 +- .../ko/clean_eval_data.py | 59 +++++++------------ .../ko/taggers/cardinal.py | 38 +++++++----- .../ko/taggers/tokenize_and_classify.py | 12 ++-- .../ko/taggers/word.py | 3 +- .../inverse_text_normalization/ko/utils.py | 3 - .../ko/verbalizers/__init__.py | 2 +- .../ko/verbalizers/cardinal.py | 18 ++---- .../ko/verbalizers/verbalize.py | 7 +-- .../ko/verbalizers/verbalize_final.py | 3 +- .../ko/verbalizers/word.py | 1 - .../run_evaluate.py | 2 +- .../nemo_text_processing/ko/test_cardinal.py | 6 +- .../pynini_export.py | 2 +- 14 files changed, 68 insertions(+), 92 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index e505a8ad0..acda8b7f9 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -135,7 +135,7 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import ( VerbalizeFinalFst, - ) + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -180,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja','ko'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja', 'ko'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py 
b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py index 3c1193333..bc429e858 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py +++ b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py @@ -282,41 +282,24 @@ def process_address_1(instance: Instance) -> Instance: filters = [] -filters.append(Filter(class_type="CARDINAL", - process_func=process_cardinal_1, filter_func=filter_cardinal_1)) -filters.append(Filter(class_type="ORDINAL", - process_func=process_ordinal_1, filter_func=filter_ordinal_1)) -filters.append(Filter(class_type="DECIMAL", - process_func=process_decimal_1, filter_func=filter_decimal_1)) -filters.append(Filter(class_type="MEASURE", - process_func=process_measure_1, filter_func=filter_measure_1)) -filters.append(Filter(class_type="MONEY", - process_func=process_money_1, filter_func=filter_money_1)) -filters.append(Filter(class_type="TIME", - process_func=process_time_1, filter_func=filter_time_1)) - -filters.append(Filter(class_type="DATE", - process_func=process_date_1, filter_func=filter_date_1)) -filters.append(Filter(class_type="PLAIN", - process_func=process_plain_1, filter_func=filter_plain_1)) -filters.append(Filter(class_type="PUNCT", - process_func=process_punct_1, filter_func=filter_punct_1)) -filters.append(Filter(class_type="LETTERS", - process_func=process_letters_1, filter_func=filter_letters_1)) -filters.append(Filter(class_type="VERBATIM", - process_func=process_verbatim_1, filter_func=filter_verbatim_1)) -filters.append(Filter(class_type="DIGIT", - process_func=process_digit_1, filter_func=filter_digit_1)) -filters.append(Filter(class_type="TELEPHONE", - process_func=process_telephone_1, filter_func=filter_telephone_1)) -filters.append(Filter(class_type="ELECTRONIC", - process_func=process_electronic_1, filter_func=filter_electronic_1)) -filters.append(Filter(class_type="FRACTION", - process_func=process_fraction_1, filter_func=filter_fraction_1)) 
-filters.append(Filter(class_type="ADDRESS", - process_func=process_address_1, filter_func=filter_address_1)) -filters.append(Filter(class_type=EOS_TYPE, - process_func=lambda x: x, filter_func=lambda x: True)) +filters.append(Filter(class_type="CARDINAL", process_func=process_cardinal_1, filter_func=filter_cardinal_1)) +filters.append(Filter(class_type="ORDINAL", process_func=process_ordinal_1, filter_func=filter_ordinal_1)) +filters.append(Filter(class_type="DECIMAL", process_func=process_decimal_1, filter_func=filter_decimal_1)) +filters.append(Filter(class_type="MEASURE", process_func=process_measure_1, filter_func=filter_measure_1)) +filters.append(Filter(class_type="MONEY", process_func=process_money_1, filter_func=filter_money_1)) +filters.append(Filter(class_type="TIME", process_func=process_time_1, filter_func=filter_time_1)) + +filters.append(Filter(class_type="DATE", process_func=process_date_1, filter_func=filter_date_1)) +filters.append(Filter(class_type="PLAIN", process_func=process_plain_1, filter_func=filter_plain_1)) +filters.append(Filter(class_type="PUNCT", process_func=process_punct_1, filter_func=filter_punct_1)) +filters.append(Filter(class_type="LETTERS", process_func=process_letters_1, filter_func=filter_letters_1)) +filters.append(Filter(class_type="VERBATIM", process_func=process_verbatim_1, filter_func=filter_verbatim_1)) +filters.append(Filter(class_type="DIGIT", process_func=process_digit_1, filter_func=filter_digit_1)) +filters.append(Filter(class_type="TELEPHONE", process_func=process_telephone_1, filter_func=filter_telephone_1)) +filters.append(Filter(class_type="ELECTRONIC", process_func=process_electronic_1, filter_func=filter_electronic_1)) +filters.append(Filter(class_type="FRACTION", process_func=process_fraction_1, filter_func=filter_fraction_1)) +filters.append(Filter(class_type="ADDRESS", process_func=process_address_1, filter_func=filter_address_1)) +filters.append(Filter(class_type=EOS_TYPE, process_func=lambda x: x, 
filter_func=lambda x: True)) def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]: @@ -344,10 +327,8 @@ def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Inst def parse_args(): parser = ArgumentParser() - parser.add_argument("--input", help="input file path", - type=str, default='./en_with_types/output-00001-of-00100') - parser.add_argument( - "--verbose", help="print filtered instances", action='store_true') + parser.add_argument("--input", help="input file path", type=str, default='./en_with_types/output-00001-of-00100') + parser.add_argument("--verbose", help="print filtered instances", action='store_true') return parser.parse_args() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index df5804fc0..09cc03909 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -19,6 +19,7 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals @@ -37,14 +38,14 @@ def __init__(self): graph_negative = pynini.cross("마이너스", "-") graph_negative += delete_space - + ten = pynutil.delete("십") ten_alt = pynini.cross("십", "1") ### Responsible for second digit of two digit number. ex) 20's 2 graph_ten_component = pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0")) ### Responsible for the first digit of number. 
ex) 1,2,3,4,5,,, graph_ten_component += graph_digit | pynutil.insert("0") - + hundred = pynutil.delete("백") hundred_alt = pynini.cross("백", "1") graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0")) @@ -59,29 +60,36 @@ def __init__(self): tenthousand_alt = pynini.cross("만", "1") ### "만" can express next four digits of numbers until the next unit "억", so insert "0000" to allocate four digit worth of space ### From "만", keep adding four digits and graph_thousand_component(0000-9999), because Korean units increase every four digits - graph_tenthousand_component = pynini.union(((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000")) + graph_tenthousand_component = pynini.union( + ((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000") + ) graph_tenthousand_component += graph_thousand_component hundredmillion = pynutil.delete("억") hundredmillion_alt = pynini.cross("억", "1") - graph_hundredmillion_component = pynini.union(((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000")) - graph_hundredmillion_component += graph_tenthousand_component - + graph_hundredmillion_component = pynini.union( + ((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000") + ) + graph_hundredmillion_component += graph_tenthousand_component + trillion = pynutil.delete("조") trillion_alt = pynini.cross("조", "1") - graph_trillion_component = pynini.union(((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000")) + graph_trillion_component = pynini.union( + ((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000") + ) graph_trillion_component += graph_hundredmillion_component tenquadrillion = pynutil.delete("경") tenquadrillion_alt = pynini.cross("경", "1") - graph_tenquadrillion_component = pynini.union(((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), 
pynutil.insert("0000")) + graph_tenquadrillion_component = pynini.union( + ((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000") + ) graph_tenquadrillion_component += graph_trillion_component - graph = pynini.union( ### From biggest unit to smallest, everything is included - graph_tenquadrillion_component| - graph_zero + graph_tenquadrillion_component + | graph_zero ) leading_zero = ( @@ -89,16 +97,18 @@ def __init__(self): ) graph_nonzero = graph @ leading_zero graph = pynini.union(graph_nonzero, graph_zero) - + graph = graph @ leading_zero | graph_zero self.just_cardinals = graph - optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) + optional_sign = pynini.closure( + (pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space, 0, 1 + ) final_graph = ( optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) final_graph = self.add_tokens(final_graph) - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 760ce6829..2842a4167 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -19,15 +19,15 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( +from 
nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( INPUT_LOWER_CASED, GraphFst, delete_extra_space, delete_space, generator_main, ) +from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst class ClassifyFst(GraphFst): @@ -64,8 +64,8 @@ def __init__( cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - classify = (pynutil.add_weight(cardinal_graph, 1.1)| pynutil.add_weight(word_graph, 100)) - + classify = pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(word_graph, 100) + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") tagger = pynini.closure(token, 1) @@ -73,4 +73,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file + logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py index 0d6ccd5c5..0e4dbb93c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py @@ -27,6 +27,5 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="classify") - word = pynutil.insert( - "name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/utils.py b/nemo_text_processing/inverse_text_normalization/ko/utils.py index 0222cc0b8..d198c3835 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ko/utils.py @@ -15,9 
+15,6 @@ import os - def get_abs_path(rel_path): return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path - - diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py index da950f35e..f541211af 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -14,4 +14,4 @@ from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst \ No newline at end of file +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py index 1800a6dc8..fb9a76d8e 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py @@ -15,11 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - NEMO_NOT_QUOTE, - GraphFst, - delete_space, -) +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class CardinalFst(GraphFst): @@ -34,21 +30,17 @@ def __init__(self): pynutil.delete("negative:") + delete_space + pynutil.delete("\"") - + pynini.accep("-") + + pynini.accep("-") + pynutil.delete("\"") ) optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1) - digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) + digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 
1) integer_cardinal = ( - pynutil.delete("integer:") - + delete_space - + pynutil.delete("\"") - + digits_from_tag - + pynutil.delete("\"") + pynutil.delete("integer:") + delete_space + pynutil.delete("\"") + digits_from_tag + pynutil.delete("\"") ) graph = integer_cardinal final_graph = optional_sign_output + graph - self.fst = self.delete_tokens(final_graph).optimize() \ No newline at end of file + self.fst = self.delete_tokens(final_graph).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 9d750d757..d8851e206 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst class VerbalizeFst(GraphFst): @@ -30,7 +30,6 @@ def __init__(self): cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - - graph = (cardinal_graph|word_graph) + + graph = cardinal_graph | word_graph self.fst = graph - diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 8554fc161..09b4cbc8b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -18,9 +18,9 @@ import pynini from pynini.lib import pynutil 
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space class VerbalizeFinalFst(GraphFst): @@ -28,6 +28,7 @@ class VerbalizeFinalFst(GraphFst): Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ + def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) far_file = None diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index d79957ca8..c134fe63a 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -20,7 +20,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space - class WordFst(GraphFst): ''' tokens { name: "一" } -> 一 diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 7bfdd3399..133474940 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja","ko"], + choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", 
"mr", "pt", "ru", "sv", "vi", "zh", "ja", "ko"], default="en", type=str, ) diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py index 9fd366ea6..526747668 100644 --- a/tests/nemo_text_processing/ko/test_cardinal.py +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -33,7 +33,5 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer_with_audio_ko = ( - NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) - if RUN_AUDIO_BASED_TESTS - else None - ) \ No newline at end of file + NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) if RUN_AUDIO_BASED_TESTS else None + ) diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 0df099774..d1ba34a37 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,7 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', - 'ko' + 'ko', ], type=str, default='en', From 9f7e876841b518a5b4d3d5e68df760cb7126729c Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Fri, 16 May 2025 13:10:40 -0700 Subject: [PATCH 03/29] fixing all the feedbacks Signed-off-by: hmlee245 --- .../ko/clean_eval_data.py | 361 ------------------ .../ko/data/numbers/zero.tsv | 1 - .../ko/graph_utils.py | 2 +- .../ko/taggers/__init__.py | 3 - .../ko/taggers/cardinal.py | 6 +- .../ko/taggers/tokenize_and_classify.py | 2 - .../ko/verbalizers/__init__.py | 4 - .../ko/verbalizers/verbalize_final.py | 1 - .../ko/verbalizers/word.py | 4 +- .../nemo_text_processing/ko/test_cardinal.py | 12 +- 10 files changed, 5 insertions(+), 391 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py delete mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv diff --git a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py 
b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py deleted file mode 100644 index 3c1193333..000000000 --- a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py +++ /dev/null @@ -1,361 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from argparse import ArgumentParser -from typing import List - -import regex as re - -from nemo_text_processing.text_normalization.data_loader_utils import ( - EOS_TYPE, - Instance, - load_files, - training_data_to_sentences, -) - -""" -This file is for evaluation purposes. -filter_loaded_data() cleans data (list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually. -For example, normalized text should only include characters and whitespace characters but no punctuation. - Cardinal unnormalized instances should contain at least one integer and all other characters are removed. 
-""" - - -class Filter: - """ - Filter class - - Args: - class_type: semiotic class used in dataset - process_func: function to transform text - filter_func: function to filter text - - """ - - def __init__(self, class_type: str, process_func: object, filter_func: object): - self.class_type = class_type - self.process_func = process_func - self.filter_func = filter_func - - def filter(self, instance: Instance) -> bool: - """ - filter function - - Args: - filters given instance with filter function - - Returns: True if given instance fulfills criteria or does not belong to class type - """ - if instance.token_type != self.class_type: - return True - return self.filter_func(instance) - - def process(self, instance: Instance) -> Instance: - """ - process function - - Args: - processes given instance with process function - - Returns: processed instance if instance belongs to expected class type or original instance - """ - if instance.token_type != self.class_type: - return instance - return self.process_func(instance) - - -def filter_cardinal_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_cardinal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r"[^0-9]", "", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_ordinal_1(instance: Instance) -> bool: - ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized) - return ok - - -def process_ordinal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r"[,\s]", "", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def 
filter_decimal_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_decimal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - un_normalized = re.sub(r",", "", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_measure_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_measure_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r",", "", un_normalized) - un_normalized = re.sub(r"m2", "m²", un_normalized) - un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized) - normalized = re.sub(r"[^a-z\s]", "", normalized) - normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_money_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_money_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r",", "", un_normalized) - un_normalized = re.sub(r"a\$", r"$", un_normalized) - un_normalized = re.sub(r"us\$", r"$", un_normalized) - un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized) - un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_time_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_time_1(instance: Instance) -> Instance: - un_normalized = 
instance.un_normalized - un_normalized = re.sub(r": ", ":", un_normalized) - un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized) - un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_plain_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_plain_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_punct_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_punct_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_date_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_date_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - un_normalized = re.sub(r",", "", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_letters_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_letters_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_verbatim_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_verbatim_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - 
normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_digit_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_digit_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_telephone_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_telephone_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_electronic_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_electronic_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_fraction_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_fraction_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_address_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_address_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = 
re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -filters = [] -filters.append(Filter(class_type="CARDINAL", - process_func=process_cardinal_1, filter_func=filter_cardinal_1)) -filters.append(Filter(class_type="ORDINAL", - process_func=process_ordinal_1, filter_func=filter_ordinal_1)) -filters.append(Filter(class_type="DECIMAL", - process_func=process_decimal_1, filter_func=filter_decimal_1)) -filters.append(Filter(class_type="MEASURE", - process_func=process_measure_1, filter_func=filter_measure_1)) -filters.append(Filter(class_type="MONEY", - process_func=process_money_1, filter_func=filter_money_1)) -filters.append(Filter(class_type="TIME", - process_func=process_time_1, filter_func=filter_time_1)) - -filters.append(Filter(class_type="DATE", - process_func=process_date_1, filter_func=filter_date_1)) -filters.append(Filter(class_type="PLAIN", - process_func=process_plain_1, filter_func=filter_plain_1)) -filters.append(Filter(class_type="PUNCT", - process_func=process_punct_1, filter_func=filter_punct_1)) -filters.append(Filter(class_type="LETTERS", - process_func=process_letters_1, filter_func=filter_letters_1)) -filters.append(Filter(class_type="VERBATIM", - process_func=process_verbatim_1, filter_func=filter_verbatim_1)) -filters.append(Filter(class_type="DIGIT", - process_func=process_digit_1, filter_func=filter_digit_1)) -filters.append(Filter(class_type="TELEPHONE", - process_func=process_telephone_1, filter_func=filter_telephone_1)) -filters.append(Filter(class_type="ELECTRONIC", - process_func=process_electronic_1, filter_func=filter_electronic_1)) -filters.append(Filter(class_type="FRACTION", - process_func=process_fraction_1, filter_func=filter_fraction_1)) -filters.append(Filter(class_type="ADDRESS", - process_func=process_address_1, filter_func=filter_address_1)) -filters.append(Filter(class_type=EOS_TYPE, - process_func=lambda x: x, filter_func=lambda x: True)) 
- - -def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]: - """ - Filters list of instances - - Args: - data: list of instances - - Returns: filtered and transformed list of instances - """ - updates_instances = [] - for instance in data: - updated_instance = False - for fil in filters: - if fil.class_type == instance.token_type and fil.filter(instance): - instance = fil.process(instance) - updated_instance = True - if updated_instance: - if verbose: - print(instance) - updates_instances.append(instance) - return updates_instances - - -def parse_args(): - parser = ArgumentParser() - parser.add_argument("--input", help="input file path", - type=str, default='./en_with_types/output-00001-of-00100') - parser.add_argument( - "--verbose", help="print filtered instances", action='store_true') - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - file_path = args.input - - print("Loading training data: " + file_path) - instance_list = load_files([file_path]) # List of instances - filtered_instance_list = filter_loaded_data(instance_list, args.verbose) - training_data_to_sentences(filtered_instance_list) diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv deleted file mode 100644 index 43baac7c1..000000000 --- a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv +++ /dev/null @@ -1 +0,0 @@ -영 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py index 7a9fd8720..50f1eb3b9 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. # Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py index f541211af..f6e3c3795 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py @@ -12,6 +12,3 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index df5804fc0..7253019f0 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -31,13 +31,9 @@ class CardinalFst(GraphFst): def __init__(self): super().__init__(name="cardinal", kind="classify") - graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) - graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) graph_zero = pynini.cross("영", "0") + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - graph_negative = pynini.cross("마이너스", "-") - graph_negative += delete_space - ten = pynutil.delete("십") ten_alt = pynini.cross("십", "1") ### Responsible for second digit of two digit number. 
ex) 20's 2 diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 760ce6829..bb6b35d41 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -24,8 +24,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( INPUT_LOWER_CASED, GraphFst, - delete_extra_space, - delete_space, generator_main, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py index da950f35e..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -11,7 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 8554fc161..8d40d2804 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -19,7 +19,6 @@ from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index d79957ca8..a423d5d0c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -13,11 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- -import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py index 9fd366ea6..872a5aa2a 100644 --- a/tests/nemo_text_processing/ko/test_cardinal.py +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -16,10 +16,8 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize import Normalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestCardinal: @@ -30,10 +28,4 @@ class TestCardinal: @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_ko.inverse_normalize(test_input, verbose=False) - assert pred == expected - - normalizer_with_audio_ko = ( - NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) - if RUN_AUDIO_BASED_TESTS - else None - ) \ No newline at end of file + assert pred == expected \ No newline at end of file From 4df2965feae682f7762f3c6f292613339869a89b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 May 2025 20:23:32 +0000 Subject: [PATCH 04/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/ko/taggers/__init__.py | 1 - .../ko/taggers/tokenize_and_classify.py | 6 +----- .../ko/verbalizers/verbalize_final.py | 5 ++++- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git 
a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py index f6e3c3795..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 30e0f5df4..75e3f6f20 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -19,11 +19,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - INPUT_LOWER_CASED, - GraphFst, - generator_main, -) +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 648285758..09c917d00 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -20,10 +20,13 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main from 
nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst + <<<<<<< HEAD -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main + ======= from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst + >>>>>>> 77da79d12b1378502cc2b382cd6933b02e7c2545 From 41ac59d791511cd82c03b242e8ec671c91360c6e Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Fri, 16 May 2025 13:36:00 -0700 Subject: [PATCH 05/29] This reverts commit f893d89bd8890e1b46df1e40054cc9176ac7ce7a, reversing changes made to 9f7e876841b518a5b4d3d5e68df760cb7126729c. Signed-off-by: hmlee245 --- .../inverse_normalize.py | 4 +- .../ko/taggers/cardinal.py | 42 ++++++------------- .../ko/taggers/tokenize_and_classify.py | 12 ++++-- .../ko/taggers/word.py | 3 +- .../inverse_text_normalization/ko/utils.py | 3 ++ .../ko/verbalizers/__init__.py | 7 ---- .../ko/verbalizers/cardinal.py | 18 +++++--- .../ko/verbalizers/verbalize.py | 7 ++-- .../ko/verbalizers/verbalize_final.py | 11 +---- .../ko/verbalizers/word.py | 1 + .../run_evaluate.py | 2 +- .../nemo_text_processing/ko/test_cardinal.py | 10 +---- .../pynini_export.py | 2 +- 13 files changed, 50 insertions(+), 72 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index acda8b7f9..e505a8ad0 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -135,7 +135,7 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import ( VerbalizeFinalFst, - ) + ) self.tagger = 
ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -180,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja', 'ko'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja','ko'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index f3fa597e3..7253019f0 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -19,7 +19,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path - class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals @@ -35,19 +34,13 @@ def __init__(self): graph_zero = pynini.cross("영", "0") graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) -<<<<<<< HEAD -======= - graph_negative = pynini.cross("마이너스", "-") - graph_negative += delete_space - ->>>>>>> 77da79d12b1378502cc2b382cd6933b02e7c2545 ten = pynutil.delete("십") ten_alt = pynini.cross("십", "1") ### Responsible for second digit of two digit number. ex) 20's 2 graph_ten_component = pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0")) ### Responsible for the first digit of number. 
ex) 1,2,3,4,5,,, graph_ten_component += graph_digit | pynutil.insert("0") - + hundred = pynutil.delete("백") hundred_alt = pynini.cross("백", "1") graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0")) @@ -62,36 +55,29 @@ def __init__(self): tenthousand_alt = pynini.cross("만", "1") ### "만" can express next four digits of numbers until the next unit "억", so insert "0000" to allocate four digit worth of space ### From "만", keep adding four digits and graph_thousand_component(0000-9999), because Korean units increase every four digits - graph_tenthousand_component = pynini.union( - ((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000") - ) + graph_tenthousand_component = pynini.union(((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000")) graph_tenthousand_component += graph_thousand_component hundredmillion = pynutil.delete("억") hundredmillion_alt = pynini.cross("억", "1") - graph_hundredmillion_component = pynini.union( - ((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000") - ) - graph_hundredmillion_component += graph_tenthousand_component - + graph_hundredmillion_component = pynini.union(((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000")) + graph_hundredmillion_component += graph_tenthousand_component + trillion = pynutil.delete("조") trillion_alt = pynini.cross("조", "1") - graph_trillion_component = pynini.union( - ((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000") - ) + graph_trillion_component = pynini.union(((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000")) graph_trillion_component += graph_hundredmillion_component tenquadrillion = pynutil.delete("경") tenquadrillion_alt = pynini.cross("경", "1") - graph_tenquadrillion_component = pynini.union( - ((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), 
pynutil.insert("0000") - ) + graph_tenquadrillion_component = pynini.union(((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000")) graph_tenquadrillion_component += graph_trillion_component + graph = pynini.union( ### From biggest unit to smallest, everything is included - graph_tenquadrillion_component - | graph_zero + graph_tenquadrillion_component| + graph_zero ) leading_zero = ( @@ -99,18 +85,16 @@ def __init__(self): ) graph_nonzero = graph @ leading_zero graph = pynini.union(graph_nonzero, graph_zero) - + graph = graph @ leading_zero | graph_zero self.just_cardinals = graph - optional_sign = pynini.closure( - (pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space, 0, 1 - ) + optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) final_graph = ( optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) final_graph = self.add_tokens(final_graph) - self.fst = final_graph.optimize() + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 75e3f6f20..bb6b35d41 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -19,9 +19,13 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst +from 
nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + INPUT_LOWER_CASED, + GraphFst, + generator_main, +) class ClassifyFst(GraphFst): @@ -58,8 +62,8 @@ def __init__( cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - classify = pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(word_graph, 100) - + classify = (pynutil.add_weight(cardinal_graph, 1.1)| pynutil.add_weight(word_graph, 100)) + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") tagger = pynini.closure(token, 1) @@ -67,4 +71,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") + logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py index 0e4dbb93c..0d6ccd5c5 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py @@ -27,5 +27,6 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="classify") - word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + word = pynutil.insert( + "name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/utils.py b/nemo_text_processing/inverse_text_normalization/ko/utils.py index d198c3835..0222cc0b8 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ko/utils.py @@ -15,6 +15,9 @@ import os + def get_abs_path(rel_path): return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + diff --git 
a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py index b8e634eef..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -11,10 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -<<<<<<< HEAD -======= - -from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst ->>>>>>> 77da79d12b1378502cc2b382cd6933b02e7c2545 diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py index fb9a76d8e..1800a6dc8 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py @@ -15,7 +15,11 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) class CardinalFst(GraphFst): @@ -30,17 +34,21 @@ def __init__(self): pynutil.delete("negative:") + delete_space + pynutil.delete("\"") - + pynini.accep("-") + + pynini.accep("-") + pynutil.delete("\"") ) optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1) - digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) + digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) 
integer_cardinal = ( - pynutil.delete("integer:") + delete_space + pynutil.delete("\"") + digits_from_tag + pynutil.delete("\"") + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + digits_from_tag + + pynutil.delete("\"") ) graph = integer_cardinal final_graph = optional_sign_output + graph - self.fst = self.delete_tokens(final_graph).optimize() + self.fst = self.delete_tokens(final_graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index d8851e206..9d750d757 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst class VerbalizeFst(GraphFst): @@ -30,6 +30,7 @@ def __init__(self): cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - - graph = cardinal_graph | word_graph + + graph = (cardinal_graph|word_graph) self.fst = graph + diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 09c917d00..8d40d2804 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -18,16 +18,8 @@ import pynini from pynini.lib import pynutil 
-from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst - -<<<<<<< HEAD -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main - -======= -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst - ->>>>>>> 77da79d12b1378502cc2b382cd6933b02e7c2545 +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space class VerbalizeFinalFst(GraphFst): @@ -35,7 +27,6 @@ class VerbalizeFinalFst(GraphFst): Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ - def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) far_file = None diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index ecf62bfe3..a423d5d0c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -18,6 +18,7 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst + class WordFst(GraphFst): ''' tokens { name: "一" } -> 一 diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 133474940..7bfdd3399 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", 
help="language", - choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja", "ko"], + choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja","ko"], default="en", type=str, ) diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py index ff5950f2a..872a5aa2a 100644 --- a/tests/nemo_text_processing/ko/test_cardinal.py +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -28,12 +28,4 @@ class TestCardinal: @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_ko.inverse_normalize(test_input, verbose=False) -<<<<<<< HEAD - assert pred == expected -======= - assert pred == expected - - normalizer_with_audio_ko = ( - NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) if RUN_AUDIO_BASED_TESTS else None - ) ->>>>>>> 77da79d12b1378502cc2b382cd6933b02e7c2545 + assert pred == expected \ No newline at end of file diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index d1ba34a37..0df099774 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,7 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', - 'ko', + 'ko' ], type=str, default='en', From a5164dc157fdfd6af8aeca449eb7875c80ba6aae Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 May 2025 20:55:36 +0000 Subject: [PATCH 06/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_normalize.py | 4 +-- .../ko/taggers/cardinal.py | 36 ++++++++++++------- .../ko/taggers/tokenize_and_classify.py | 12 +++---- .../ko/taggers/word.py | 3 +- .../inverse_text_normalization/ko/utils.py | 3 -- .../ko/verbalizers/cardinal.py | 18 +++------- .../ko/verbalizers/verbalize.py | 
7 ++-- .../ko/verbalizers/verbalize_final.py | 3 +- .../ko/verbalizers/word.py | 1 - .../run_evaluate.py | 2 +- .../nemo_text_processing/ko/test_cardinal.py | 2 +- .../pynini_export.py | 2 +- 12 files changed, 43 insertions(+), 50 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index e505a8ad0..acda8b7f9 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -135,7 +135,7 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import ( VerbalizeFinalFst, - ) + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -180,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja','ko'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja', 'ko'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index 7253019f0..14172b4e9 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -19,6 +19,7 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals @@ -40,7 +41,7 @@ def __init__(self): graph_ten_component 
= pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0")) ### Responsible for the first digit of number. ex) 1,2,3,4,5,,, graph_ten_component += graph_digit | pynutil.insert("0") - + hundred = pynutil.delete("백") hundred_alt = pynini.cross("백", "1") graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0")) @@ -55,29 +56,36 @@ def __init__(self): tenthousand_alt = pynini.cross("만", "1") ### "만" can express next four digits of numbers until the next unit "억", so insert "0000" to allocate four digit worth of space ### From "만", keep adding four digits and graph_thousand_component(0000-9999), because Korean units increase every four digits - graph_tenthousand_component = pynini.union(((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000")) + graph_tenthousand_component = pynini.union( + ((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000") + ) graph_tenthousand_component += graph_thousand_component hundredmillion = pynutil.delete("억") hundredmillion_alt = pynini.cross("억", "1") - graph_hundredmillion_component = pynini.union(((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000")) - graph_hundredmillion_component += graph_tenthousand_component - + graph_hundredmillion_component = pynini.union( + ((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000") + ) + graph_hundredmillion_component += graph_tenthousand_component + trillion = pynutil.delete("조") trillion_alt = pynini.cross("조", "1") - graph_trillion_component = pynini.union(((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000")) + graph_trillion_component = pynini.union( + ((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000") + ) graph_trillion_component += graph_hundredmillion_component tenquadrillion = pynutil.delete("경") tenquadrillion_alt = pynini.cross("경", "1") - 
graph_tenquadrillion_component = pynini.union(((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000")) + graph_tenquadrillion_component = pynini.union( + ((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000") + ) graph_tenquadrillion_component += graph_trillion_component - graph = pynini.union( ### From biggest unit to smallest, everything is included - graph_tenquadrillion_component| - graph_zero + graph_tenquadrillion_component + | graph_zero ) leading_zero = ( @@ -85,16 +93,18 @@ def __init__(self): ) graph_nonzero = graph @ leading_zero graph = pynini.union(graph_nonzero, graph_zero) - + graph = graph @ leading_zero | graph_zero self.just_cardinals = graph - optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) + optional_sign = pynini.closure( + (pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space, 0, 1 + ) final_graph = ( optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) final_graph = self.add_tokens(final_graph) - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index bb6b35d41..75e3f6f20 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -19,13 +19,9 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import 
CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - INPUT_LOWER_CASED, - GraphFst, - generator_main, -) class ClassifyFst(GraphFst): @@ -62,8 +58,8 @@ def __init__( cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - classify = (pynutil.add_weight(cardinal_graph, 1.1)| pynutil.add_weight(word_graph, 100)) - + classify = pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(word_graph, 100) + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") tagger = pynini.closure(token, 1) @@ -71,4 +67,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file + logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py index 0d6ccd5c5..0e4dbb93c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py @@ -27,6 +27,5 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="classify") - word = pynutil.insert( - "name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/utils.py b/nemo_text_processing/inverse_text_normalization/ko/utils.py index 0222cc0b8..d198c3835 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ko/utils.py @@ -15,9 +15,6 @@ import os - def get_abs_path(rel_path): return os.path.dirname(os.path.abspath(__file__)) + 
'/' + rel_path - - diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py index 1800a6dc8..fb9a76d8e 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py @@ -15,11 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - NEMO_NOT_QUOTE, - GraphFst, - delete_space, -) +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class CardinalFst(GraphFst): @@ -34,21 +30,17 @@ def __init__(self): pynutil.delete("negative:") + delete_space + pynutil.delete("\"") - + pynini.accep("-") + + pynini.accep("-") + pynutil.delete("\"") ) optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1) - digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) + digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) integer_cardinal = ( - pynutil.delete("integer:") - + delete_space - + pynutil.delete("\"") - + digits_from_tag - + pynutil.delete("\"") + pynutil.delete("integer:") + delete_space + pynutil.delete("\"") + digits_from_tag + pynutil.delete("\"") ) graph = integer_cardinal final_graph = optional_sign_output + graph - self.fst = self.delete_tokens(final_graph).optimize() \ No newline at end of file + self.fst = self.delete_tokens(final_graph).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 9d750d757..d8851e206 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under 
the License. +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst class VerbalizeFst(GraphFst): @@ -30,7 +30,6 @@ def __init__(self): cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - - graph = (cardinal_graph|word_graph) + + graph = cardinal_graph | word_graph self.fst = graph - diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 8d40d2804..17f547740 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -18,8 +18,8 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space class VerbalizeFinalFst(GraphFst): @@ -27,6 +27,7 @@ class VerbalizeFinalFst(GraphFst): Finite state transducer that verbalizes an entire sentence, e.g. 
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ + def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) far_file = None diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index a423d5d0c..ecf62bfe3 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -18,7 +18,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst - class WordFst(GraphFst): ''' tokens { name: "一" } -> 一 diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 7bfdd3399..133474940 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja","ko"], + choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja", "ko"], default="en", type=str, ) diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py index 872a5aa2a..f95d74107 100644 --- a/tests/nemo_text_processing/ko/test_cardinal.py +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -28,4 +28,4 @@ class TestCardinal: @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_ko.inverse_normalize(test_input, verbose=False) - assert pred == expected \ No newline at end of file + assert pred == expected 
diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 0df099774..d1ba34a37 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,7 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', - 'ko' + 'ko', ], type=str, default='en', From 7842d1324e32a40bd522b99eba726f962dafc742 Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Fri, 23 May 2025 16:31:36 -0700 Subject: [PATCH 07/29] third draft of korean ITN work. Mainly fixing minor issues and adding test cases Signed-off-by: hmlee245 --- Jenkinsfile | 22 +++++++++++++++++++ .../ko/data/numbers/thousands.tsv | 11 ---------- .../ko/data/numbers/zero.tsv | 1 + .../ko/taggers/cardinal.py | 8 +++---- .../test_cases_cardinal.txt | 12 +++++++++- 5 files changed, 37 insertions(+), 17 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv diff --git a/Jenkinsfile b/Jenkinsfile index c94c107c6..32375f28f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,6 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-03-25-1' + KO_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/05-21-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { @@ -318,6 +319,22 @@ pipeline { } } } + stage('L0: Create KO ITN Grammars') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: KO ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ko --text="100" --cache_dir ${KO_TN_CACHE}' + } + } + } + } // L1 Tests starts here @@ 
-406,6 +423,11 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hy/ -m "not pleasefixme" --cpu --tn_cache_dir ${HY_TN_CACHE}' } } + stage('L1: Run all KO TN/ITN tests (restore grammars from cache)') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ko/ -m "not pleasefixme" --cpu --tn_cache_dir ${KO_TN_CACHE}' + } + } } } diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv deleted file mode 100644 index 541752211..000000000 --- a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv +++ /dev/null @@ -1,11 +0,0 @@ -억 -조 -경 -해 -자 -양 -구 -간 -정 -재 -극 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv new file mode 100644 index 000000000..cbf967001 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv @@ -0,0 +1 @@ +영 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index 7253019f0..a1cf1012f 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -31,7 +31,7 @@ class CardinalFst(GraphFst): def __init__(self): super().__init__(name="cardinal", kind="classify") - graph_zero = pynini.cross("영", "0") + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) ten = pynutil.delete("십") @@ -85,15 +85,13 @@ def __init__(self): ) graph_nonzero = graph @ leading_zero graph = pynini.union(graph_nonzero, graph_zero) - - graph = graph @ leading_zero | graph_zero self.just_cardinals = graph - 
optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) + negative_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) final_graph = ( - optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") + negative_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) final_graph = self.add_tokens(final_graph) diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt index 007273e5e..4f64116e5 100644 --- a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt @@ -24,4 +24,14 @@ 백오십억칠천만~15070000000 오천억~500000000000 일조~1000000000000 -이조오천억~2500000000000 \ No newline at end of file +이조오천억~2500000000000 +영영영~000 +영영백이십삼~00123 +만천~11000 +만천백십일~11111 +경~10000000000000000 +마이너스일~-1 +마이너스 일~-1 +- 일~-1 +마이너스일억사천이백칠십구만구천팔십이~-142799082 +마이너스 칠백삼십오~-735 \ No newline at end of file From b95f5fbb20f560e5592e7d52f0c9d1e3c3f124cd Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Tue, 10 Jun 2025 13:55:47 -0700 Subject: [PATCH 08/29] Commiting the first draft of Korean Ordinal ITN Signed-off-by: hmlee245 --- .../ko/data/ordinals/digit.tsv | 39 ++++++++++++++++ .../ko/taggers/cardinal.py | 5 +-- .../ko/taggers/ordinal.py | 45 +++++++++++++++++++ .../ko/taggers/tokenize_and_classify.py | 12 ++++- .../ko/verbalizers/ordinal.py | 36 +++++++++++++++ .../ko/verbalizers/verbalize.py | 10 ++++- .../test_cases_ordinal.txt | 19 ++++++++ tests/nemo_text_processing/ko/test_ordinal.py | 32 +++++++++++++ 8 files changed, 193 insertions(+), 5 deletions(-) create mode 100644 
nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py create mode 100644 tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt create mode 100644 tests/nemo_text_processing/ko/test_ordinal.py diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit.tsv new file mode 100644 index 000000000..532a4ed2e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit.tsv @@ -0,0 +1,39 @@ +첫 1 +두 2 +세 3 +네 4 +다섯 5 +여섯 6 +일곱 7 +여덟 8 +아홉 9 +열 10 +열한 11 +열두 12 +열세 13 +열네 14 +열다섯 15 +열여섯 16 +열일곱 17 +열여덟 18 +열아홉 19 +스무 20 +스물한 21 +스물두 22 +스물세 23 +스물네 24 +스물다섯 25 +스물여섯 26 +스물일곱 27 +스물여덟 28 +스물아홉 29 +서른 30 +서른한 31 +서른두 32 +서른세 33 +서른네 34 +서른다섯 35 +서른여섯 36 +서른일곱 37 +서른여덟 38 +서른아홉 39 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index 1c78f6000..83b2b80d4 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -87,12 +87,11 @@ def __init__(self): graph_tenquadrillion_component | graph_zero ) - + leading_zero = ( pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) ) - graph_nonzero = graph @ leading_zero - graph = pynini.union(graph_nonzero, graph_zero) + graph = graph @ leading_zero | graph_zero self.just_cardinals = graph diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py new file mode 100644 index 000000000..2068c0894 --- 
/dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py @@ -0,0 +1,45 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, NEMO_CHAR +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for classifying ordinal + Expressing integers in ordinal way for 1-39 and cardinal for 40+ due to Korean grammar. + e.g. 스물세번째 -> ordinal {integer: "23", 23번째} + e.g. 
사십오번째 -> ordinal but the integer part is written in cardinal(due to korean grammar) + { integer: "45", 45번쨰} + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="ordinal", kind="classify") + + cardinals = cardinal.just_cardinals + graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")) + ordinals = pynini.accep("째") | pynini.accep("번째") + + ordinal_graph = ( + pynutil.insert("integer: \"") + ((graph_digit + ordinals) | (cardinals + ordinals)) + pynutil.insert("\"") + ) + + final_graph = self.add_tokens(ordinal_graph) + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 75e3f6f20..78f6198d0 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -22,6 +22,7 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst class ClassifyFst(GraphFst): @@ -55,10 +56,19 @@ def __init__( logging.info(f"ClassifyFst.fst was restored from {far_file}.") else: logging.info(f"Creating ClassifyFst grammars.") + cardinal = CardinalFst() cardinal_graph = cardinal.fst + + ordinal = OrdinalFst(cardinal) + ordinal_graph = ordinal.fst + word_graph = WordFst().fst - classify = pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(word_graph, 100) + + classify = (pynutil.add_weight(cardinal_graph, 1.1) + | pynutil.add_weight(ordinal_graph, 1.1) + | pynutil.add_weight(word_graph, 100) + ) token 
= pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") tagger = pynini.closure(token, 1) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py new file mode 100644 index 000000000..b857a3be0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for classifying cardinals + e.g. 스물세번째 -> ordinal {integer: "23", 23번째} + e.g. 
사십오번째 -> ordinal but the integer part is written in cardinal(due to korean grammar) + { integer: "45", 45번쨰} + """ + + def __init__(self): + super().__init__(name="ordinal", kind="verbalize") + + integer_component = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + + final_graph = self.delete_tokens(integer_component) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index d8851e206..6a3af3cf4 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -16,6 +16,7 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst class VerbalizeFst(GraphFst): @@ -29,7 +30,14 @@ def __init__(self): super().__init__(name="verbalize", kind="verbalize") cardinal = CardinalFst() cardinal_graph = cardinal.fst + + ordinal = OrdinalFst() + ordinal_graph = ordinal.fst + word_graph = WordFst().fst - graph = cardinal_graph | word_graph + graph = (cardinal_graph + | word_graph + | ordinal_graph + ) self.fst = graph diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..2caad7dc3 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,19 @@ +영번째~0번째 +첫번째~1번째 +두번째~2번째 +세째~3째 +다섯째~5째 +아홉번째~9번째 +열번째~10번째 +열한번째~11번째 +열일곱째~17째 +스무번째~20번째 +스물두번째~22번째 +스물아홉째~29째 
+서른번째~30번째 +서른째~30째 +사십번째~40번째 +사십째~40째 +오십번째~50번째 +오십삼번째~53번째 +백번째~100번째 diff --git a/tests/nemo_text_processing/ko/test_ordinal.py b/tests/nemo_text_processing/ko/test_ordinal.py new file mode 100644 index 000000000..b07c8bd55 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_ordinal.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestOrdinal: + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected From 9a00ba65ccd3d6949d492dc8cfac9e9bbbda0e5f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:34:11 +0000 Subject: [PATCH 09/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- 
.../inverse_text_normalization/ko/taggers/cardinal.py | 6 ++++-- .../inverse_text_normalization/ko/taggers/ordinal.py | 4 ++-- .../ko/taggers/tokenize_and_classify.py | 10 +++++----- .../ko/verbalizers/verbalize.py | 7 ++----- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index 83b2b80d4..5987a9771 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -87,7 +87,7 @@ def __init__(self): graph_tenquadrillion_component | graph_zero ) - + leading_zero = ( pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) ) @@ -95,7 +95,9 @@ def __init__(self): self.just_cardinals = graph - negative_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) + negative_sign = pynini.closure( + (pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space, 0, 1 + ) final_graph = ( negative_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py index 2068c0894..62cc81203 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py @@ -17,7 +17,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, NEMO_CHAR +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_CHAR, GraphFst from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path @@ -42,4 +42,4 @@ def __init__(self, 
cardinal: GraphFst): ) final_graph = self.add_tokens(ordinal_graph) - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 7f8613506..df5f330f5 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -21,9 +21,8 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst - +from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst class ClassifyFst(GraphFst): @@ -66,9 +65,10 @@ def __init__( word_graph = WordFst().fst - classify = (pynutil.add_weight(cardinal_graph, 1.1) - | pynutil.add_weight(ordinal_graph, 1.1) - | pynutil.add_weight(word_graph, 100) + classify = ( + pynutil.add_weight(cardinal_graph, 1.1) + | pynutil.add_weight(ordinal_graph, 1.1) + | pynutil.add_weight(word_graph, 100) ) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 6a3af3cf4..305f36dc9 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -15,8 +15,8 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from 
nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst class VerbalizeFst(GraphFst): @@ -36,8 +36,5 @@ def __init__(self): word_graph = WordFst().fst - graph = (cardinal_graph - | word_graph - | ordinal_graph - ) + graph = cardinal_graph | word_graph | ordinal_graph self.fst = graph From 63ce43df7a0a0d4aae5c10f9345e63ff98a040b9 Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Mon, 16 Jun 2025 15:49:25 -0700 Subject: [PATCH 10/29] Update after first Korean Ordinal ITN pull request review Signed-off-by: hmlee245 --- .../ko/data/ordinals/cardinal_digit.tsv | 39 ++++++++++++++++++ .../ko/data/ordinals/digit.tsv | 34 +-------------- .../ko/data/ordinals/digit_no_one.tsv | 8 ++++ .../ko/taggers/ordinal.py | 41 +++++++++++++++++-- .../ko/verbalizers/verbalize.py | 4 -- .../ko/verbalizers/verbalize_final.py | 40 ++++++++---------- .../ko/verbalizers/word.py | 19 +++++---- .../test_cases_ordinal.txt | 10 ++--- .../nemo_text_processing/ko/test_cardinal.py | 2 +- ..._sparrowhawk_inverse_text_normalization.sh | 5 +++ 10 files changed, 126 insertions(+), 76 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/ordinals/cardinal_digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit_no_one.tsv diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/cardinal_digit.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/cardinal_digit.tsv new file mode 100644 index 000000000..19e188ac6 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/cardinal_digit.tsv @@ -0,0 +1,39 @@ +일 1 +이 2 +삼 3 +사 4 +오 5 +육 6 +칠 7 +팔 8 +구 9 +십 10 +십일 11 +십이 12 
+십삼 13 +십사 14 +십오 15 +십육 16 +십칠 17 +십팔 18 +십구 19 +이십 20 +이십일 21 +이십이 22 +이십삼 23 +이십사 24 +이십오 25 +이십육 26 +이십칠 27 +이십팔 28 +이십구 29 +삼십 30 +삼십일 31 +삼십이 32 +삼십삼 33 +삼십사 34 +삼십오 35 +삼십육 36 +삼십칠 37 +삼십팔 38 +삼십구 39 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit.tsv index 532a4ed2e..d2fdd1846 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit.tsv +++ b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit.tsv @@ -1,4 +1,4 @@ -첫 1 +한 1 두 2 세 3 네 4 @@ -6,34 +6,4 @@ 여섯 6 일곱 7 여덟 8 -아홉 9 -열 10 -열한 11 -열두 12 -열세 13 -열네 14 -열다섯 15 -열여섯 16 -열일곱 17 -열여덟 18 -열아홉 19 -스무 20 -스물한 21 -스물두 22 -스물세 23 -스물네 24 -스물다섯 25 -스물여섯 26 -스물일곱 27 -스물여덟 28 -스물아홉 29 -서른 30 -서른한 31 -서른두 32 -서른세 33 -서른네 34 -서른다섯 35 -서른여섯 36 -서른일곱 37 -서른여덟 38 -서른아홉 39 \ No newline at end of file +아홉 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit_no_one.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit_no_one.tsv new file mode 100644 index 000000000..00ab6d0b4 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit_no_one.tsv @@ -0,0 +1,8 @@ +두 2 +세 3 +네 4 +다섯 5 +여섯 6 +일곱 7 +여덟 8 +아홉 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py index 2068c0894..0bd3484e7 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py @@ -34,11 +34,46 @@ def __init__(self, cardinal: GraphFst): super().__init__(name="ordinal", kind="classify") cardinals = cardinal.just_cardinals - graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")) - ordinals = pynini.accep("째") | 
pynini.accep("번째") + ordinals_suffix = pynini.accep("번째") #Korean ordinal's morphosyntactic feature + + graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")) #1-9 + graph_digit_no_one = pynini.string_file(get_abs_path("data/ordinals/digit_no_one.tsv")) #2-9 + cardinal_1to39 = pynini.string_file(get_abs_path("data/ordinals/cardinal_digit.tsv")) #1-39 in cardinals + + graph_tens_prefix = pynini.cross("열", "1") #First digit for tens + graph_twenties_prefix = pynini.cross("스물", "2") #First digit for twenties + graph_thirties_prefix = pynini.cross("서른", "3") #First digit for thirties + + graph_one = pynini.cross("첫", "1") + graph_single = graph_one | graph_digit_no_one + # 1 has a unique ordinal case in Korean and does not repeat for 11, 21, 31 + + graph_ten = pynini.cross("열", "10") + graph_tens = graph_ten | graph_tens_prefix + graph_digit + + graph_twenty = pynini.cross("스무", "20") + graph_twenties = graph_twenty | graph_twenties_prefix + graph_digit + + graph_thirty = pynini.cross("서른", "30") + graph_thirties = graph_thirty | graph_thirties_prefix + graph_digit + + ordinals = pynini.union( + graph_single, #1-9 + graph_tens, #10-19 + graph_twenties, #20-29 + graph_thirties #30-39 + ).optimize() + + cardinals_acceptor = pynini.project(cardinals, "input").optimize() #Input includes all cardinal expressions + cardinals_exception = pynini.project(cardinal_1to39, "input").optimize() #Input includes cardinal expression from 1 to 39 + + cardinal_plus_40 = pynini.difference(cardinals_acceptor,cardinals_exception).optimize() #All cardinal values - 1 to 39 cardinal values + cardinal_ordinal = cardinal_plus_40 @ cardinals + + ordinal_final = pynini.union(ordinals, cardinal_ordinal) # 1 to 39 in ordinal, everything else cardinal ordinal_graph = ( - pynutil.insert("integer: \"") + ((graph_digit + ordinals) | (cardinals + ordinals)) + pynutil.insert("\"") + pynutil.insert("integer: \"") + ((ordinal_final + ordinals_suffix)) + pynutil.insert("\"") ) final_graph 
= self.add_tokens(ordinal_graph) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 6a3af3cf4..7baa749f3 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -15,7 +15,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst @@ -34,10 +33,7 @@ def __init__(self): ordinal = OrdinalFst() ordinal_graph = ordinal.fst - word_graph = WordFst().fst - graph = (cardinal_graph - | word_graph | ordinal_graph ) self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 17f547740..6ba917b35 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -13,13 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os - import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, delete_extra_space from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst class VerbalizeFinalFst(GraphFst): @@ -28,22 +27,19 @@ class VerbalizeFinalFst(GraphFst): tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ - def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): - super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) - far_file = None - if cache_dir is not None and cache_dir != "None": - os.makedirs(cache_dir, exist_ok=True) - far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_deterministic_verbalizer.far") - if not overwrite_cache and far_file and os.path.exists(far_file): - self.fst = pynini.Far(far_file, mode="r")["verbalize"] - else: - # token_graph = VerbalizeFst(deterministic=deterministic) - token_graph = VerbalizeFst().fst - token_verbalizer = ( - pynutil.delete("tokens {") + delete_space + token_graph + delete_space + pynutil.delete(" }") - ) - verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) - - self.fst = (verbalizer).optimize() - if far_file: - generator_main(far_file, {"verbalize": self.fst}) + def __init__(self): + super().__init__(name="verbalize_final", kind="verbalize") + verbalize = VerbalizeFst().fst + word = WordFst().fst + types = verbalize | word + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_extra_space) 
+ graph + delete_space + self.fst = graph \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index ecf62bfe3..29f8fb647 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -13,19 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space class WordFst(GraphFst): - ''' - tokens { name: "一" } -> 一 - ''' + """ + Finite state transducer for verbalizing plain tokens + e.g. tokens { name: "sleep" } -> sleep + """ - def __init__(self, deterministic: bool = True, lm: bool = False): - super().__init__(name="word", kind="verbalize", deterministic=deterministic) - - graph = pynutil.delete("name: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"") + def __init__(self): + super().__init__(name="word", kind="verbalize") + chars = pynini.closure(NEMO_CHAR - " ", 1) + char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt index 2caad7dc3..08baa6c97 100644 --- a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt @@ -1,19 +1,17 @@ 영번째~0번째 첫번째~1번째 두번째~2번째 -세째~3째 -다섯째~5째 +세번째~3번째 
+다섯번째~5번째 아홉번째~9번째 열번째~10번째 열한번째~11번째 -열일곱째~17째 +열일곱번째~17번째 스무번째~20번째 스물두번째~22번째 -스물아홉째~29째 +스물아홉번째~29번째 서른번째~30번째 -서른째~30째 사십번째~40번째 -사십째~40째 오십번째~50번째 오십삼번째~53번째 백번째~100번째 diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py index f95d74107..96681fd8b 100644 --- a/tests/nemo_text_processing/ko/test_cardinal.py +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh index c44f4a703..5053be55d 100644 --- a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh @@ -27,6 +27,11 @@ testITNCardinal() { runtest $input } +testITNOrdinal() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_ordinal.txt + runtest $input +} + # Remove all command-line arguments shift $# From 473f0423c3940197c9ea11f38ee077293c8e447b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 16 Jun 2025 23:00:18 +0000 Subject: [PATCH 11/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../ko/taggers/ordinal.py | 39 +++++++++---------- .../ko/verbalizers/verbalize.py | 4 +- .../ko/verbalizers/verbalize_final.py | 4 +- 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py index 085990f62..1ab598546 100644 --- 
a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py @@ -34,23 +34,23 @@ def __init__(self, cardinal: GraphFst): super().__init__(name="ordinal", kind="classify") cardinals = cardinal.just_cardinals - ordinals_suffix = pynini.accep("번째") #Korean ordinal's morphosyntactic feature + ordinals_suffix = pynini.accep("번째") # Korean ordinal's morphosyntactic feature - graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")) #1-9 - graph_digit_no_one = pynini.string_file(get_abs_path("data/ordinals/digit_no_one.tsv")) #2-9 - cardinal_1to39 = pynini.string_file(get_abs_path("data/ordinals/cardinal_digit.tsv")) #1-39 in cardinals + graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")) # 1-9 + graph_digit_no_one = pynini.string_file(get_abs_path("data/ordinals/digit_no_one.tsv")) # 2-9 + cardinal_1to39 = pynini.string_file(get_abs_path("data/ordinals/cardinal_digit.tsv")) # 1-39 in cardinals - graph_tens_prefix = pynini.cross("열", "1") #First digit for tens - graph_twenties_prefix = pynini.cross("스물", "2") #First digit for twenties - graph_thirties_prefix = pynini.cross("서른", "3") #First digit for thirties + graph_tens_prefix = pynini.cross("열", "1") # First digit for tens + graph_twenties_prefix = pynini.cross("스물", "2") # First digit for twenties + graph_thirties_prefix = pynini.cross("서른", "3") # First digit for thirties graph_one = pynini.cross("첫", "1") - graph_single = graph_one | graph_digit_no_one + graph_single = graph_one | graph_digit_no_one # 1 has a unique ordinal case in Korean and does not repeat for 11, 21, 31 graph_ten = pynini.cross("열", "10") graph_tens = graph_ten | graph_tens_prefix + graph_digit - + graph_twenty = pynini.cross("스무", "20") graph_twenties = graph_twenty | graph_twenties_prefix + graph_digit @@ -58,23 +58,22 @@ def __init__(self, cardinal: GraphFst): graph_thirties = graph_thirty | graph_thirties_prefix + 
graph_digit ordinals = pynini.union( - graph_single, #1-9 - graph_tens, #10-19 - graph_twenties, #20-29 - graph_thirties #30-39 + graph_single, graph_tens, graph_twenties, graph_thirties # 1-9 # 10-19 # 20-29 # 30-39 ).optimize() - cardinals_acceptor = pynini.project(cardinals, "input").optimize() #Input includes all cardinal expressions - cardinals_exception = pynini.project(cardinal_1to39, "input").optimize() #Input includes cardinal expression from 1 to 39 + cardinals_acceptor = pynini.project(cardinals, "input").optimize() # Input includes all cardinal expressions + cardinals_exception = pynini.project( + cardinal_1to39, "input" + ).optimize() # Input includes cardinal expression from 1 to 39 - cardinal_plus_40 = pynini.difference(cardinals_acceptor,cardinals_exception).optimize() #All cardinal values - 1 to 39 cardinal values + cardinal_plus_40 = pynini.difference( + cardinals_acceptor, cardinals_exception + ).optimize() # All cardinal values - 1 to 39 cardinal values cardinal_ordinal = cardinal_plus_40 @ cardinals - ordinal_final = pynini.union(ordinals, cardinal_ordinal) # 1 to 39 in ordinal, everything else cardinal + ordinal_final = pynini.union(ordinals, cardinal_ordinal) # 1 to 39 in ordinal, everything else cardinal - ordinal_graph = ( - pynutil.insert("integer: \"") + ((ordinal_final + ordinals_suffix)) + pynutil.insert("\"") - ) + ordinal_graph = pynutil.insert("integer: \"") + ((ordinal_final + ordinals_suffix)) + pynutil.insert("\"") final_graph = self.add_tokens(ordinal_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index b5a3d6dc1..7a2bd341c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -34,7 +34,5 @@ def __init__(self): ordinal = OrdinalFst() ordinal_graph = 
ordinal.fst - graph = (cardinal_graph - | ordinal_graph - ) + graph = cardinal_graph | ordinal_graph self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 6ba917b35..6bcca5fb8 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -16,7 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, delete_extra_space +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_extra_space, delete_space from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst @@ -42,4 +42,4 @@ def __init__(self): + pynutil.delete("}") ) graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space - self.fst = graph \ No newline at end of file + self.fst = graph From 6d25ac95da8291864f00020d480e2cae323d2df7 Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Tue, 24 Jun 2025 17:19:41 -0700 Subject: [PATCH 12/29] Deleting unnecessary data files and rules Signed-off-by: hmlee245 --- .../ko/data/ordinals/cardinal_digit.tsv | 39 ------------------- .../ko/data/ordinals/digit_no_one.tsv | 8 ---- .../ko/taggers/ordinal.py | 39 ++++++++++++++----- 3 files changed, 29 insertions(+), 57 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/ordinals/cardinal_digit.tsv delete mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit_no_one.tsv diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/cardinal_digit.tsv 
b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/cardinal_digit.tsv deleted file mode 100644 index 19e188ac6..000000000 --- a/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/cardinal_digit.tsv +++ /dev/null @@ -1,39 +0,0 @@ -일 1 -이 2 -삼 3 -사 4 -오 5 -육 6 -칠 7 -팔 8 -구 9 -십 10 -십일 11 -십이 12 -십삼 13 -십사 14 -십오 15 -십육 16 -십칠 17 -십팔 18 -십구 19 -이십 20 -이십일 21 -이십이 22 -이십삼 23 -이십사 24 -이십오 25 -이십육 26 -이십칠 27 -이십팔 28 -이십구 29 -삼십 30 -삼십일 31 -삼십이 32 -삼십삼 33 -삼십사 34 -삼십오 35 -삼십육 36 -삼십칠 37 -삼십팔 38 -삼십구 39 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit_no_one.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit_no_one.tsv deleted file mode 100644 index 00ab6d0b4..000000000 --- a/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/digit_no_one.tsv +++ /dev/null @@ -1,8 +0,0 @@ -두 2 -세 3 -네 4 -다섯 5 -여섯 6 -일곱 7 -여덟 8 -아홉 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py index 085990f62..a8b6e9842 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py @@ -36,17 +36,22 @@ def __init__(self, cardinal: GraphFst): cardinals = cardinal.just_cardinals ordinals_suffix = pynini.accep("번째") #Korean ordinal's morphosyntactic feature - graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")) #1-9 - graph_digit_no_one = pynini.string_file(get_abs_path("data/ordinals/digit_no_one.tsv")) #2-9 - cardinal_1to39 = pynini.string_file(get_abs_path("data/ordinals/cardinal_digit.tsv")) #1-39 in cardinals + graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")) #1-9 in ordinals + cardinal_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) #1-9 in cardinals graph_tens_prefix = 
pynini.cross("열", "1") #First digit for tens graph_twenties_prefix = pynini.cross("스물", "2") #First digit for twenties graph_thirties_prefix = pynini.cross("서른", "3") #First digit for thirties - graph_one = pynini.cross("첫", "1") - graph_single = graph_one | graph_digit_no_one - # 1 has a unique ordinal case in Korean and does not repeat for 11, 21, 31 + graph_one = pynini.cross("한", "1") + single_digits = pynini.project(graph_digit, "input").optimize() + graph_one_acceptor = pynini.project(graph_one, "input").optimize() + two_to_nine = pynini.difference(single_digits,graph_one_acceptor).optimize() + graph_two_to_nine = two_to_nine @ graph_digit + graph_first = pynini.cross("첫", "1") + graph_single = graph_two_to_nine | graph_first + # Line 46-52 exclude regular 1 in ordinal and replace with a special 1. Like "first" in English + # The special 1 is a unique ordinal case for Korean and does not repeat for 11, 21, 31 graph_ten = pynini.cross("열", "10") graph_tens = graph_ten | graph_tens_prefix + graph_digit @@ -64,13 +69,27 @@ def __init__(self, cardinal: GraphFst): graph_thirties #30-39 ).optimize() + cardinal_10_to_19 = pynini.cross("십", "10") | (pynini.accep("십") + cardinal_digit) + + cardinal_20_to_29 = pynini.cross("이십", "20") | (pynini.accep("이십") + cardinal_digit) + + cardinal_30_to_39 = pynini.cross("삼십", "30") | (pynini.accep("삼십") + cardinal_digit) + + cardinal_below_40 = pynini.union( + cardinal_digit, + cardinal_10_to_19, + cardinal_20_to_29, + cardinal_30_to_39 + ).optimize() + # FST that include 1-39 in cardinal expression + cardinals_acceptor = pynini.project(cardinals, "input").optimize() #Input includes all cardinal expressions - cardinals_exception = pynini.project(cardinal_1to39, "input").optimize() #Input includes cardinal expression from 1 to 39 + cardinals_exception = pynini.project(cardinal_below_40, "input").optimize() #Input includes cardinal expression from 1 to 39 - cardinal_plus_40 = 
pynini.difference(cardinals_acceptor,cardinals_exception).optimize() #All cardinal values - 1 to 39 cardinal values - cardinal_ordinal = cardinal_plus_40 @ cardinals + cardinal_over_40 = pynini.difference(cardinals_acceptor,cardinals_exception).optimize() #All cardinal values except 1 to 39 cardinal values + cardinal_ordinal_suffix = cardinal_over_40 @ cardinals - ordinal_final = pynini.union(ordinals, cardinal_ordinal) # 1 to 39 in ordinal, everything else cardinal + ordinal_final = pynini.union(ordinals, cardinal_ordinal_suffix) # 1 to 39 in ordinal, everything else cardinal ordinal_graph = ( pynutil.insert("integer: \"") + ((ordinal_final + ordinals_suffix)) + pynutil.insert("\"") From 2c4574b6f58093f2a98e4da9c685d2467d60eada Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 25 Jun 2025 00:26:20 +0000 Subject: [PATCH 13/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../ko/taggers/ordinal.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py index e62e51da6..f5171c3a5 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py @@ -36,8 +36,8 @@ def __init__(self, cardinal: GraphFst): cardinals = cardinal.just_cardinals ordinals_suffix = pynini.accep("번째") # Korean ordinal's morphosyntactic feature - graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")) #1-9 in ordinals - cardinal_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) #1-9 in cardinals + graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")) # 1-9 in ordinals + cardinal_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) # 1-9 in 
cardinals graph_tens_prefix = pynini.cross("열", "1") # First digit for tens graph_twenties_prefix = pynini.cross("스물", "2") # First digit for twenties @@ -46,7 +46,7 @@ def __init__(self, cardinal: GraphFst): graph_one = pynini.cross("한", "1") single_digits = pynini.project(graph_digit, "input").optimize() graph_one_acceptor = pynini.project(graph_one, "input").optimize() - two_to_nine = pynini.difference(single_digits,graph_one_acceptor).optimize() + two_to_nine = pynini.difference(single_digits, graph_one_acceptor).optimize() graph_two_to_nine = two_to_nine @ graph_digit graph_first = pynini.cross("첫", "1") graph_single = graph_two_to_nine | graph_first @@ -69,24 +69,25 @@ def __init__(self, cardinal: GraphFst): cardinal_10_to_19 = pynini.cross("십", "10") | (pynini.accep("십") + cardinal_digit) cardinal_20_to_29 = pynini.cross("이십", "20") | (pynini.accep("이십") + cardinal_digit) - + cardinal_30_to_39 = pynini.cross("삼십", "30") | (pynini.accep("삼십") + cardinal_digit) cardinal_below_40 = pynini.union( - cardinal_digit, - cardinal_10_to_19, - cardinal_20_to_29, - cardinal_30_to_39 + cardinal_digit, cardinal_10_to_19, cardinal_20_to_29, cardinal_30_to_39 ).optimize() # FST that include 1-39 in cardinal expression - cardinals_acceptor = pynini.project(cardinals, "input").optimize() #Input includes all cardinal expressions - cardinals_exception = pynini.project(cardinal_below_40, "input").optimize() #Input includes cardinal expression from 1 to 39 + cardinals_acceptor = pynini.project(cardinals, "input").optimize() # Input includes all cardinal expressions + cardinals_exception = pynini.project( + cardinal_below_40, "input" + ).optimize() # Input includes cardinal expression from 1 to 39 - cardinal_over_40 = pynini.difference(cardinals_acceptor,cardinals_exception).optimize() #All cardinal values except 1 to 39 cardinal values + cardinal_over_40 = pynini.difference( + cardinals_acceptor, cardinals_exception + ).optimize() # All cardinal values except 1 to 39 cardinal 
values cardinal_ordinal_suffix = cardinal_over_40 @ cardinals - ordinal_final = pynini.union(ordinals, cardinal_ordinal_suffix) # 1 to 39 in ordinal, everything else cardinal + ordinal_final = pynini.union(ordinals, cardinal_ordinal_suffix) # 1 to 39 in ordinal, everything else cardinal ordinal_graph = pynutil.insert("integer: \"") + ((ordinal_final + ordinals_suffix)) + pynutil.insert("\"") From bc73bb78be5ea04284a3ee33b8eeeaeb7dccea1e Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Thu, 3 Jul 2025 11:02:18 -0700 Subject: [PATCH 14/29] Adding decimal to the PR Signed-off-by: hmlee245 --- .../ko/taggers/decimal.py | 69 +++++++++++++++++++ .../ko/taggers/tokenize_and_classify.py | 5 ++ .../ko/verbalizers/decimal.py | 59 ++++++++++++++++ .../ko/verbalizers/verbalize.py | 6 +- .../test_cases_decimal.txt | 14 ++++ tests/nemo_text_processing/ko/test_decimal.py | 32 +++++++++ ..._sparrowhawk_inverse_text_normalization.sh | 5 ++ 7 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/decimal.py create mode 100644 tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_decimal.txt create mode 100644 tests/nemo_text_processing/ko/test_decimal.py diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py new file mode 100644 index 000000000..b2b82c32b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py @@ -0,0 +1,69 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + + +def get_quantity(decimal): + suffix = pynini.union("만", "억", "조", "경") + numbers = decimal + res = numbers + pynutil.insert(' quantity: "') + suffix + pynutil.insert('"') + + return res + + +class DecimalFst(GraphFst): + """ + Finite state transducer for classifying decimal + e.g. 일점오 -> decimal { integer_part: "1" fractional_part: "5" } + e.g. 일점오만 -> decimal { integer_part: "1" fractional_part: "5" quantity: "만" } + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="decimal", kind="classify") + + cardinals = cardinal.just_cardinals + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + decimal_part = pynini.closure(graph_zero | graph_digit) + + decimal_point = pynutil.delete("점") + integer_part = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") + fractional_part = pynutil.insert("fractional_part: \"") + decimal_part + pynutil.insert("\"") + + graph_decimal_regular = integer_part + decimal_point + pynutil.insert(" ") + fractional_part #Regular decimal like 1.5 + graph_deicimal_larger = get_quantity(graph_decimal_regular) #If decimal is used to express big numbers like 15000 -> "1.5만" + + + self.decimal = graph_decimal_regular | graph_deicimal_larger + self.just_decimal = cardinals +
pynini.cross("점", ".") + decimal_part + + graph_sign = ( + pynutil.insert("negative: \"") + (pynini.cross("마이너스", "-") | pynini.accep("-")) + pynutil.insert("\"") + ) + + final_graph = ( + (graph_sign + pynutil.insert(" ") + graph_decimal_regular) + | (graph_sign + pynutil.insert(" ") + graph_deicimal_larger) + | graph_decimal_regular + | graph_deicimal_larger + ) + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index df5f330f5..3689d69e3 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -23,6 +23,7 @@ from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst class ClassifyFst(GraphFst): @@ -63,11 +64,15 @@ def __init__( ordinal = OrdinalFst(cardinal) ordinal_graph = ordinal.fst + decimal = DecimalFst(cardinal) + decimal_graph = decimal.fst + word_graph = WordFst().fst classify = ( pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(ordinal_graph, 1.1) + | pynutil.add_weight(decimal_graph, 1.1) | pynutil.add_weight(word_graph, 100) ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/decimal.py new file mode 100644 index 000000000..f59bd6a10 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/decimal.py @@ -0,0 +1,59 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst + + +class DecimalFst(GraphFst): + """ + Finite state transducer for verbalizing decimal + e.g. decimal { integer_part: "1" fractional_part: "5" } -> 1.5 + e.g. decimal { integer_part: "1" fractional_part: "5" quantity: "만" } -> 1.5만 + """ + + def __init__(self): + super().__init__(name="decimal", kind="verbalize") + + decimal_point = pynutil.insert(".") + integer_part = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + fractional_part = ( + pynutil.delete("fractional_part: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + ) + quantity_part = pynutil.delete("quantity: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + + graph_decimal = integer_part + decimal_point + pynutil.delete(" ") + fractional_part + graph_decimal_larger = ( + integer_part + + decimal_point + + pynutil.delete(" ") + + fractional_part + + pynutil.delete(" ") + + quantity_part + ) + + graph_sign = pynutil.delete("negative: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + + graph = ( + graph_decimal + | graph_decimal_larger + | (graph_sign + pynutil.delete(" ") + graph_decimal) + | (graph_sign + pynutil.delete(" ") + graph_decimal_larger) + ) + + final_graph = self.delete_tokens(graph) + self.fst = final_graph.optimize() 
diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 7a2bd341c..98dce0dc4 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -16,6 +16,7 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst @@ -34,5 +35,8 @@ def __init__(self): ordinal = OrdinalFst() ordinal_graph = ordinal.fst - graph = cardinal_graph | ordinal_graph + decimal = DecimalFst() + decimal_graph = decimal.fst + + graph = cardinal_graph | ordinal_graph | decimal_graph self.fst = graph diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..63d023168 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_decimal.txt @@ -0,0 +1,14 @@ +일점삼~1.3 +영점오~0.5 +십점오~10.5 +이십삼점사~23.4 +백점일~100.1 +일점이삼~1.23 +영점오육칠~0.567 +구십구점구구~99.99 +일점영삼~1.03 +영점영영일~0.001 +천이백삼십사점오육~1234.56 +일점오만~1.5만 +일점오억~1.5억 +일점오경~1.5경 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_decimal.py b/tests/nemo_text_processing/ko/test_decimal.py new file mode 100644 index 000000000..733139df0 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_decimal.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDecimal: + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh index 5053be55d..7927877b8 100644 --- a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh @@ -32,6 +32,11 @@ testITNOrdinal() { runtest $input } +testITNDecimal() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_decimal.txt + runtest $input +} + # Remove all command-line arguments shift $# From 68a69069efdc7557d36cafbd7550523e533a26f1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 3 Jul 2025 18:06:21 +0000 Subject: [PATCH 15/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/ko/taggers/decimal.py | 9 ++++++--- .../ko/taggers/tokenize_and_classify.py | 2 +- .../ko/verbalizers/decimal.py | 11 ++--------- .../ko/verbalizers/verbalize.py | 2 +- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py index b2b82c32b..98ba5cef1 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py @@ -47,9 +47,12 @@ def __init__(self, cardinal: GraphFst): integer_part = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") fractional_part = pynutil.insert("fractional_part: \"") + decimal_part + pynutil.insert("\"") - graph_decimal_regular = integer_part + decimal_point + pynutil.insert(" ") + fractional_part #Regular decimal like 1.5 - graph_deicimal_larger = get_quantity(graph_decimal_regular) #If decimal is used to express big numbers like 15000 -> "1.5만" - + graph_decimal_regular = ( + integer_part + decimal_point + pynutil.insert(" ") + fractional_part + ) # Regular decimal like 1.5 + graph_deicimal_larger = get_quantity( + graph_decimal_regular + ) # If decimal is used to express big numbers like 15000 -> "1.5만" self.decimal = graph_decimal_regular | graph_deicimal_larger self.just_decimal = cardinals + pynini.cross("점", ".") + decimal_part diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 3689d69e3..5753e4b66 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py 
+++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -21,9 +21,9 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/decimal.py index f59bd6a10..65f225f45 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/decimal.py @@ -31,19 +31,12 @@ def __init__(self): decimal_point = pynutil.insert(".") integer_part = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - fractional_part = ( - pynutil.delete("fractional_part: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - ) + fractional_part = pynutil.delete("fractional_part: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") quantity_part = pynutil.delete("quantity: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") graph_decimal = integer_part + decimal_point + pynutil.delete(" ") + fractional_part graph_decimal_larger = ( - integer_part - + decimal_point - + pynutil.delete(" ") - + fractional_part - + pynutil.delete(" ") - + quantity_part + integer_part + decimal_point + pynutil.delete(" ") + fractional_part + pynutil.delete(" ") + quantity_part ) graph_sign = pynutil.delete("negative: \"") + pynini.closure(NEMO_NOT_QUOTE) 
+ pynutil.delete("\"") diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 98dce0dc4..5d5a01b3c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -15,8 +15,8 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst From 1e313354dda96a1da9c4f31f89da580ed87c172b Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Wed, 9 Jul 2025 16:25:12 -0700 Subject: [PATCH 16/29] Adding counter suffixes for Korean ordinal and its test cases Signed-off-by: hmlee245 --- .../ko/taggers/cardinal.py | 3 +-- .../ko/taggers/ordinal.py | 21 +++++++++++++++++-- .../ko/verbalizers/ordinal.py | 11 +++++++--- .../ko/verbalizers/word.py | 2 +- .../test_cases_ordinal.txt | 8 +++++++ 5 files changed, 37 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index 5987a9771..c8202475d 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -1,5 +1,4 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -91,7 +90,7 @@ def __init__(self): leading_zero = ( pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) ) - graph = graph @ leading_zero | graph_zero + graph = (graph @ leading_zero) | graph_zero self.just_cardinals = graph diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py index f5171c3a5..e59301d0c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py @@ -1,5 +1,4 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,6 +19,12 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_CHAR, GraphFst from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path +def get_counter(ordinal): + suffix = pynini.union("개", "명", "병", "마리", "대", "송이", "포기", "사람", "자루", "채", "켤레", "그루", "벌", "잔", "장", "권", "살") + numbers = ordinal + res = numbers + pynutil.insert('" counter: "') + suffix + + return res class OrdinalFst(GraphFst): """ @@ -91,5 +96,17 @@ def __init__(self, cardinal: GraphFst): ordinal_graph = pynutil.insert("integer: \"") + ((ordinal_final + ordinals_suffix)) + pynutil.insert("\"") - final_graph = self.add_tokens(ordinal_graph) + #Adding various counter suffix for ordinal + counters = pynini.union( + graph_digit, graph_tens, graph_twenties, graph_thirties + ).optimize() + # For counting, Korean does not use the speical "첫" for 1. 
Instead the regular "한" + + counter_final = (get_counter(counters) | get_counter(cardinal_ordinal_suffix)) + + counter_graph = pynutil.insert("integer: \"") + counter_final + pynutil.insert("\"") + + final_graph = (ordinal_graph | counter_graph) + + final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py index b857a3be0..13c5cb55f 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py @@ -16,7 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class OrdinalFst(GraphFst): @@ -31,6 +31,11 @@ def __init__(self): super().__init__(name="ordinal", kind="verbalize") integer_component = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + counter_component = pynutil.delete("counter: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - final_graph = self.delete_tokens(integer_component) - self.fst = final_graph.optimize() + graph_with_counter = (integer_component + delete_space + counter_component) + + ordinal_verbalizer = pynini.union(graph_with_counter, integer_component) + + final_graph = self.delete_tokens(ordinal_verbalizer) + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index 29f8fb647..6bdd7e55d 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ 
b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -16,7 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space class WordFst(GraphFst): diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt index 08baa6c97..8dfc77823 100644 --- a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt @@ -15,3 +15,11 @@ 오십번째~50번째 오십삼번째~53번째 백번째~100번째 +한개~1개 +한마리~1마리 +열병~10병 +스물한송이~21송이 +사십그루~40그루 +여섯사람~6사람 +열다섯장~15장 +서른일곱권~37권 \ No newline at end of file From 53dc07b5fa60b28f2c7c1f32ea6758252e21cb41 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Jul 2025 23:27:56 +0000 Subject: [PATCH 17/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../ko/taggers/ordinal.py | 34 ++++++++++++++----- .../ko/verbalizers/ordinal.py | 4 +-- .../ko/verbalizers/word.py | 7 +++- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py index e59301d0c..9c01bdfca 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py @@ -19,13 +19,33 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_CHAR, GraphFst from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + def 
get_counter(ordinal): - suffix = pynini.union("개", "명", "병", "마리", "대", "송이", "포기", "사람", "자루", "채", "켤레", "그루", "벌", "잔", "장", "권", "살") + suffix = pynini.union( + "개", + "명", + "병", + "마리", + "대", + "송이", + "포기", + "사람", + "자루", + "채", + "켤레", + "그루", + "벌", + "잔", + "장", + "권", + "살", + ) numbers = ordinal - res = numbers + pynutil.insert('" counter: "') + suffix + res = numbers + pynutil.insert('" counter: "') + suffix return res + class OrdinalFst(GraphFst): """ Finite state transducer for classifying ordinal @@ -96,17 +116,15 @@ def __init__(self, cardinal: GraphFst): ordinal_graph = pynutil.insert("integer: \"") + ((ordinal_final + ordinals_suffix)) + pynutil.insert("\"") - #Adding various counter suffix for ordinal - counters = pynini.union( - graph_digit, graph_tens, graph_twenties, graph_thirties - ).optimize() + # Adding various counter suffix for ordinal + counters = pynini.union(graph_digit, graph_tens, graph_twenties, graph_thirties).optimize() # For counting, Korean does not use the speical "첫" for 1. 
Instead the regular "한" - counter_final = (get_counter(counters) | get_counter(cardinal_ordinal_suffix)) + counter_final = get_counter(counters) | get_counter(cardinal_ordinal_suffix) counter_graph = pynutil.insert("integer: \"") + counter_final + pynutil.insert("\"") - final_graph = (ordinal_graph | counter_graph) + final_graph = ordinal_graph | counter_graph final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py index 13c5cb55f..f8f106734 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/ordinal.py @@ -33,9 +33,9 @@ def __init__(self): integer_component = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") counter_component = pynutil.delete("counter: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - graph_with_counter = (integer_component + delete_space + counter_component) + graph_with_counter = integer_component + delete_space + counter_component ordinal_verbalizer = pynini.union(graph_with_counter, integer_component) final_graph = self.delete_tokens(ordinal_verbalizer) - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index 6bdd7e55d..226b41e08 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -16,7 +16,12 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space +from 
nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_CHAR, + NEMO_SIGMA, + GraphFst, + delete_space, +) class WordFst(GraphFst): From fcfc6c5fb586e35014615b881f1d60ec73f7061b Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Mon, 21 Jul 2025 16:53:09 -0700 Subject: [PATCH 18/29] Fixing minor comments error for newly added ordinal suffix Signed-off-by: hmlee245 --- .../ko/data/ordinals/counter_suffix.tsv | 17 +++++++++++++ .../ko/taggers/decimal.py | 9 +++---- .../ko/taggers/ordinal.py | 25 +++++++++++-------- 3 files changed, 35 insertions(+), 16 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/ordinals/counter_suffix.tsv diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/counter_suffix.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/counter_suffix.tsv new file mode 100644 index 000000000..e240760ed --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/ordinals/counter_suffix.tsv @@ -0,0 +1,17 @@ +개 +명 +병 +마리 +대 +송이 +포기 +사람 +자루 +채 +켤레 +그루 +벌 +잔 +장 +권 +살 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py index 98ba5cef1..b2b82c32b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py @@ -47,12 +47,9 @@ def __init__(self, cardinal: GraphFst): integer_part = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") fractional_part = pynutil.insert("fractional_part: \"") + decimal_part + pynutil.insert("\"") - graph_decimal_regular = ( - integer_part + decimal_point + pynutil.insert(" ") + fractional_part - ) # Regular decimal like 1.5 - graph_deicimal_larger = get_quantity( - graph_decimal_regular - ) # If decimal is used to express big numbers like 15000 -> "1.5만" + graph_decimal_regular = 
integer_part + decimal_point + pynutil.insert(" ") + fractional_part #Regular decimal like 1.5 + graph_deicimal_larger = get_quantity(graph_decimal_regular) #If decimal is used to express big numbers like 15000 -> "1.5만" + self.decimal = graph_decimal_regular | graph_deicimal_larger self.just_decimal = cardinals + pynini.cross("점", ".") + decimal_part diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py index e59301d0c..799ddce7e 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py @@ -20,7 +20,7 @@ from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path def get_counter(ordinal): - suffix = pynini.union("개", "명", "병", "마리", "대", "송이", "포기", "사람", "자루", "채", "켤레", "그루", "벌", "잔", "장", "권", "살") + suffix = pynini.string_file(get_abs_path("data/ordinals/counter_suffix.tsv")) numbers = ordinal res = numbers + pynutil.insert('" counter: "') + suffix @@ -48,6 +48,8 @@ def __init__(self, cardinal: GraphFst): graph_twenties_prefix = pynini.cross("스물", "2") # First digit for twenties graph_thirties_prefix = pynini.cross("서른", "3") # First digit for thirties + # Below exclude regular 1 in ordinal and replace with a special 1. Like "first" in English + # The special 1 is a unique ordinal case for Korean and does not repeat for 11, 21, 31 graph_one = pynini.cross("한", "1") single_digits = pynini.project(graph_digit, "input").optimize() graph_one_acceptor = pynini.project(graph_one, "input").optimize() @@ -55,8 +57,7 @@ def __init__(self, cardinal: GraphFst): graph_two_to_nine = two_to_nine @ graph_digit graph_first = pynini.cross("첫", "1") graph_single = graph_two_to_nine | graph_first - # Line 46-52 exclude regular 1 in ordinal and replace with a special 1. 
Like "first" in English - # The special 1 is a unique ordinal case for Korean and does not repeat for 11, 21, 31 + graph_ten = pynini.cross("열", "10") graph_tens = graph_ten | graph_tens_prefix + graph_digit @@ -77,31 +78,35 @@ def __init__(self, cardinal: GraphFst): cardinal_30_to_39 = pynini.cross("삼십", "30") | (pynini.accep("삼십") + cardinal_digit) + # FST that include 1-39 in cardinal expression cardinal_below_40 = pynini.union( cardinal_digit, cardinal_10_to_19, cardinal_20_to_29, cardinal_30_to_39 ).optimize() - # FST that include 1-39 in cardinal expression - cardinals_acceptor = pynini.project(cardinals, "input").optimize() # Input includes all cardinal expressions + # Input includes all cardinal expressions + cardinals_acceptor = pynini.project(cardinals, "input").optimize() + # Input includes cardinal expression from 1 to 39 cardinals_exception = pynini.project( cardinal_below_40, "input" - ).optimize() # Input includes cardinal expression from 1 to 39 + ).optimize() + # All cardinal values except 1 to 39 cardinal values cardinal_over_40 = pynini.difference( cardinals_acceptor, cardinals_exception - ).optimize() # All cardinal values except 1 to 39 cardinal values + ).optimize() cardinal_ordinal_suffix = cardinal_over_40 @ cardinals - ordinal_final = pynini.union(ordinals, cardinal_ordinal_suffix) # 1 to 39 in ordinal, everything else cardinal + # 1 to 39 in ordinal, everything else cardinal + ordinal_final = pynini.union(ordinals, cardinal_ordinal_suffix) ordinal_graph = pynutil.insert("integer: \"") + ((ordinal_final + ordinals_suffix)) + pynutil.insert("\"") #Adding various counter suffix for ordinal + # For counting, Korean does not use the speical "첫" for 1. Instead the regular "한" counters = pynini.union( graph_digit, graph_tens, graph_twenties, graph_thirties ).optimize() - # For counting, Korean does not use the speical "첫" for 1. 
Instead the regular "한" - + counter_final = (get_counter(counters) | get_counter(cardinal_ordinal_suffix)) counter_graph = pynutil.insert("integer: \"") + counter_final + pynutil.insert("\"") From 9fc941fe48e6216d6b9b561bc8efdf1cd1dc899f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 21 Jul 2025 23:54:57 +0000 Subject: [PATCH 19/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../ko/taggers/decimal.py | 9 +++++--- .../ko/taggers/ordinal.py | 21 +++++++------------ 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py index b2b82c32b..98ba5cef1 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py @@ -47,9 +47,12 @@ def __init__(self, cardinal: GraphFst): integer_part = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") fractional_part = pynutil.insert("fractional_part: \"") + decimal_part + pynutil.insert("\"") - graph_decimal_regular = integer_part + decimal_point + pynutil.insert(" ") + fractional_part #Regular decimal like 1.5 - graph_deicimal_larger = get_quantity(graph_decimal_regular) #If decimal is used to express big numbers like 15000 -> "1.5만" - + graph_decimal_regular = ( + integer_part + decimal_point + pynutil.insert(" ") + fractional_part + ) # Regular decimal like 1.5 + graph_deicimal_larger = get_quantity( + graph_decimal_regular + ) # If decimal is used to express big numbers like 15000 -> "1.5만" self.decimal = graph_decimal_regular | graph_deicimal_larger self.just_decimal = cardinals + pynini.cross("점", ".") + decimal_part diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py 
b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py index aeaa1d9ee..88c576f59 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py @@ -60,7 +60,6 @@ def __init__(self, cardinal: GraphFst): graph_first = pynini.cross("첫", "1") graph_single = graph_two_to_nine | graph_first - graph_ten = pynini.cross("열", "10") graph_tens = graph_ten | graph_tens_prefix + graph_digit @@ -88,28 +87,22 @@ def __init__(self, cardinal: GraphFst): # Input includes all cardinal expressions cardinals_acceptor = pynini.project(cardinals, "input").optimize() # Input includes cardinal expression from 1 to 39 - cardinals_exception = pynini.project( - cardinal_below_40, "input" - ).optimize() + cardinals_exception = pynini.project(cardinal_below_40, "input").optimize() # All cardinal values except 1 to 39 cardinal values - cardinal_over_40 = pynini.difference( - cardinals_acceptor, cardinals_exception - ).optimize() + cardinal_over_40 = pynini.difference(cardinals_acceptor, cardinals_exception).optimize() cardinal_ordinal_suffix = cardinal_over_40 @ cardinals # 1 to 39 in ordinal, everything else cardinal - ordinal_final = pynini.union(ordinals, cardinal_ordinal_suffix) + ordinal_final = pynini.union(ordinals, cardinal_ordinal_suffix) ordinal_graph = pynutil.insert("integer: \"") + ((ordinal_final + ordinals_suffix)) + pynutil.insert("\"") - #Adding various counter suffix for ordinal + # Adding various counter suffix for ordinal # For counting, Korean does not use the speical "첫" for 1. 
Instead the regular "한" - counters = pynini.union( - graph_digit, graph_tens, graph_twenties, graph_thirties - ).optimize() - - counter_final = (get_counter(counters) | get_counter(cardinal_ordinal_suffix)) + counters = pynini.union(graph_digit, graph_tens, graph_twenties, graph_thirties).optimize() + + counter_final = get_counter(counters) | get_counter(cardinal_ordinal_suffix) counter_graph = pynutil.insert("integer: \"") + counter_final + pynutil.insert("\"") From 688c84f7a9c7f2ea5e8a445e133104b56821e7e1 Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Thu, 31 Jul 2025 11:02:08 -0700 Subject: [PATCH 20/29] Adding Korean fraction ITN to the codes and raising a new PR Signed-off-by: hmlee245 --- .../ko/taggers/decimal.py | 6 +- .../ko/taggers/fraction.py | 95 +++++++++++++++++++ .../ko/taggers/ordinal.py | 2 +- .../ko/taggers/tokenize_and_classify.py | 6 +- .../ko/verbalizers/fraction.py | 65 +++++++++++++ .../ko/verbalizers/verbalize.py | 6 +- .../test_cases_fraction.txt | 29 ++++++ .../nemo_text_processing/ko/test_fraction.py | 32 +++++++ ..._sparrowhawk_inverse_text_normalization.sh | 5 + 9 files changed, 240 insertions(+), 6 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/fraction.py create mode 100644 tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_fraction.txt create mode 100644 tests/nemo_text_processing/ko/test_fraction.py diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py index b2b82c32b..40ca9a3e0 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py @@ -16,7 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import 
GraphFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, NEMO_SPACE from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path @@ -47,8 +47,8 @@ def __init__(self, cardinal: GraphFst): integer_part = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") fractional_part = pynutil.insert("fractional_part: \"") + decimal_part + pynutil.insert("\"") - graph_decimal_regular = integer_part + decimal_point + pynutil.insert(" ") + fractional_part #Regular decimal like 1.5 - graph_deicimal_larger = get_quantity(graph_decimal_regular) #If decimal is used to express big numbers like 15000 -> "1.5만" + graph_decimal_regular = integer_part + decimal_point + pynutil.insert(NEMO_SPACE) + fractional_part # Regular decimal like 1.5 + graph_deicimal_larger = get_quantity(graph_decimal_regular) # If decimal is used to express big numbers like 15000 -> "1.5만" self.decimal = graph_decimal_regular | graph_deicimal_larger diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py new file mode 100644 index 000000000..faf522133 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, NEMO_SPACE + + +class FractionFst(GraphFst): + def __init__(self, cardinal: GraphFst, decimal: GraphFst): + """ + Fitite state transducer for classifying fractions + e.g., + fraction { denominator: "사" numerator: "삼" } -> 3/4 + fraction { mixed number: "일" denominator: "사" numerator: "삼" } -> 1 3/4 + fraction { denominator: "루트삼" numerator: "일" } -> 1/√3 + fraction { denominator: "일점육오" numerator: "오십" } -> 50/1.65 + fraction { denominator: "이루트육" numerator: "삼" } -> 3/2√6 + """ + super().__init__(name="fraction", kind="classify") + + cardinal = cardinal.just_cardinals + decimal = decimal.just_decimal + + # Expression between fraction. Means the dash "/" + fraction_word = pynutil.delete("분의") + # Expression combining mixed number and fraction. Optional to use + connecting_word = pynutil.delete("와") | pynutil.delete("과") + # Expression for "√" + root_word = pynini.accep("√") | pynini.cross("루트", "√") + + graph_sign = ( + pynutil.insert("negative: \"") + (pynini.accep("-") | pynini.cross("마이너스", "-")) + pynutil.insert("\"") + ) + + # graph_mixed_number considers all of possible combination number you can have in front of fraction + graph_mixed_number = ( + pynutil.insert("integer_part: \"") + + ( + decimal | (decimal + connecting_word) | (root_word + decimal) | (cardinal + root_word + decimal) + | (root_word + decimal + connecting_word) | (cardinal + root_word + decimal + connecting_word) + | cardinal | (cardinal + connecting_word) | (root_word + cardinal) | (cardinal + root_word + cardinal) + | (root_word + cardinal + connecting_word) | (cardinal + root_word + cardinal + connecting_word) + + ) + + pynutil.insert("\"") + ) + + graph_denominator = ( + pynutil.insert("denominator: \"") + + ( + (decimal | (cardinal + root_word + decimal) | (root_word + decimal) + | cardinal | (cardinal + root_word + cardinal) | (root_word + 
cardinal)) + + pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1) + ) + + pynutil.insert("\"") + ) + + graph_numerator = ( + pynutil.insert("numerator: \"") + + ( + (decimal | (cardinal + root_word + decimal) | (root_word + decimal) + | cardinal | (cardinal + root_word + cardinal) | (root_word + cardinal)) + + pynini.closure(pynutil.delete(NEMO_SPACE)) + ) + + pynutil.insert("\"") + ) + + graph_fraction_sign = (graph_sign + pynutil.insert(NEMO_SPACE) + graph_denominator + pynutil.insert(NEMO_SPACE) + fraction_word + graph_numerator) + graph_fraction_no_sign = (graph_denominator + pynutil.insert(NEMO_SPACE) + fraction_word + graph_numerator) + # Only fraction like "1/3" or "- 1/3" + graph_fractions = (graph_fraction_sign | graph_fraction_no_sign) + # Mixed number fraction like "2 1/3" or "-2 1/3" + graph_mixed_number_fraction = ( + pynini.closure((graph_sign + pynutil.insert(" ")), 0, 1) + pynutil.add_weight(graph_mixed_number, 1.1) + + pynutil.insert(NEMO_SPACE) + graph_denominator + pynutil.insert(NEMO_SPACE) + fraction_word + graph_numerator + ) + + final_graph = graph_fractions | graph_mixed_number_fraction + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py index aeaa1d9ee..0207cb38d 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/ordinal.py @@ -103,7 +103,7 @@ def __init__(self, cardinal: GraphFst): ordinal_graph = pynutil.insert("integer: \"") + ((ordinal_final + ordinals_suffix)) + pynutil.insert("\"") - #Adding various counter suffix for ordinal + # Adding various counter suffix for ordinal # For counting, Korean does not use the speical "첫" for 1. 
Instead the regular "한" counters = pynini.union( graph_digit, graph_tens, graph_twenties, graph_thirties diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 5753e4b66..10bd1791f 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -1,5 +1,4 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,6 +22,7 @@ from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.fraction import FractionFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst @@ -67,12 +67,16 @@ def __init__( decimal = DecimalFst(cardinal) decimal_graph = decimal.fst + fraction = FractionFst(cardinal, decimal) + fraction_graph = fraction.fst + word_graph = WordFst().fst classify = ( pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(ordinal_graph, 1.1) | pynutil.add_weight(decimal_graph, 1.1) + | pynutil.add_weight(fraction_graph, 1.0) | pynutil.add_weight(word_graph, 100) ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/fraction.py new file mode 100644 index 000000000..4e04f65e3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/fraction.py @@ -0,0 +1,65 @@ +# Copyright (c) 2025, NVIDIA 
CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_NON_BREAKING_SPACE, + NEMO_NOT_QUOTE, + NEMO_SPACE, + GraphFst, +) + + +class FractionFst(GraphFst): + def __init__(self): + """ + Finite state transducer for verbalizing fractions + e.g., + fraction { denominator: "사" numerator: "삼" } -> 3/4 + fraction { integer_part: "일" denominator: "사" numerator: "삼" } -> 1 3/4 + fraction { denominator: "루트삼" numerator: "일" } -> 1/√3 + fraction { denominator: "일점육오" numerator: "오십" } -> 50/1.65 + fraction { denominator: "이루트육" numerator: "삼" } -> 3/2√6 + """ + super().__init__(name="fraction", kind="verbalize") + + sign_component = pynutil.delete("negative: \"") + pynini.closure("-", 1) + pynutil.delete("\"") + + mixed_number_component = ( + pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + + denominator_component = ( + pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + + numerator_component = ( + pynutil.delete("numerator: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + + regular_graph = ( + pynini.closure((sign_component + pynutil.delete(NEMO_SPACE)), 0, 1) + + pynini.closure(mixed_number_component + pynutil.delete(NEMO_SPACE) + pynutil.insert(NEMO_NON_BREAKING_SPACE)) + + numerator_component + +
pynutil.delete(NEMO_SPACE) + + pynutil.insert("/") + + denominator_component + ) + + final_graph = self.delete_tokens(regular_graph) + + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 5d5a01b3c..567cdc695 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -17,6 +17,7 @@ from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.fraction import FractionFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst @@ -38,5 +39,8 @@ def __init__(self): decimal = DecimalFst() decimal_graph = decimal.fst - graph = cardinal_graph | ordinal_graph | decimal_graph + fraction = FractionFst() + fraction_graph = fraction.fst + + graph = cardinal_graph | ordinal_graph | decimal_graph | fraction_graph self.fst = graph diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_fraction.txt new file mode 100644 index 000000000..c5fda707d --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_fraction.txt @@ -0,0 +1,29 @@ +이분의일~1/2 +사분의일~1/4 +사분의삼~3/4 +오분의이~2/5 +십분의칠~7/10 +십이분의오~5/12 +이십삼분의십~10/23 +백분의일~1/100 +백분의구십구~99/100 +천분의백이십삼~123/1000 +일과이분의일~1 1/2 +삼과사분의일~3 1/4 +오와팔분의삼~5 3/8 +십과백분의칠십오~10 75/100 +마이너스사분의일~-1/4 +영점오분의일~1/0.5 +삼분의일점오~1.5/3 +루트사분의일~1/√4 +구분의루트십육~√16/9 +이와루트구분의일~2 1/√9 +마이너스오분의루트이십오~-√25/5 +칠분의육~6/7 +백오십분의이십~20/150 
+사와오분의이~4 2/5 +이십과백분의일~20 1/100 +일점오분의영점이~0.2/1.5 +루트백분의십~10/√100 +십과루트팔십일분의삼~10 3/√81 +마이너스이와십분의일~-2 1/10 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_fraction.py b/tests/nemo_text_processing/ko/test_fraction.py new file mode 100644 index 000000000..bb3a889e3 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_fraction.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestOrdinal: + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_fraction.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh index 7927877b8..a63c08f84 100644 --- 
a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh @@ -37,6 +37,11 @@ testITNDecimal() { runtest $input } +testITNFraction() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_fraction.txt + runtest $input +} + # Remove all command-line arguments shift $# From 9b4c35f530a4176da93555e1bf86f6c1afd3f60c Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Tue, 19 Aug 2025 02:56:35 -0700 Subject: [PATCH 21/29] Adding Korean ITN Time Signed-off-by: hmlee245 --- .../ko/data/time/time_hours.tsv | 12 ++++ .../ko/data/time/time_minutes_seconds.tsv | 60 ++++++++++++++++ .../ko/taggers/fraction.py | 2 +- .../ko/taggers/time.py | 63 +++++++++++++++++ .../ko/taggers/tokenize_and_classify.py | 5 ++ .../ko/verbalizers/time.py | 70 +++++++++++++++++++ .../ko/verbalizers/verbalize.py | 6 +- .../test_cases_time.txt | 19 +++++ ..._sparrowhawk_inverse_text_normalization.sh | 5 ++ tests/nemo_text_processing/ko/test_time.py | 32 +++++++++ 10 files changed, 272 insertions(+), 2 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/time/time_minutes_seconds.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/time.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py create mode 100644 tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt create mode 100644 tests/nemo_text_processing/ko/test_time.py diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv new file mode 100644 index 000000000..24b980aa1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv @@ -0,0 +1,12 @@ +한 1 +두 2 +세 3 +네 4 +다섯 5 +여섯 6 
+일곱 7 +여덟 8 +아홉 9 +열 10 +열한 11 +열두 12 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/time/time_minutes_seconds.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/time/time_minutes_seconds.tsv new file mode 100644 index 000000000..efd37f03f --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/time/time_minutes_seconds.tsv @@ -0,0 +1,60 @@ +영 0 +일 1 +이 2 +삼 3 +사 4 +오 5 +육 6 +칠 7 +팔 8 +구 9 +십 10 +십일 11 +십이 12 +십삼 13 +십사 14 +십오 15 +십육 16 +십칠 17 +십팔 18 +십구 19 +이십 20 +이십일 21 +이십이 22 +이십삼 23 +이십사 24 +이십오 25 +이십육 26 +이십칠 27 +이십팔 28 +이십구 29 +삼십 30 +삼십일 31 +삼십이 32 +삼십삼 33 +삼십사 34 +삼십오 35 +삼십육 36 +삼십칠 37 +삼십팔 38 +삼십구 39 +사십 40 +사십일 41 +사십이 42 +사십삼 43 +사십사 44 +사십오 45 +사십육 46 +사십칠 47 +사십팔 48 +사십구 49 +오십 50 +오십일 51 +오십이 52 +오십삼 53 +오십사 54 +오십오 55 +오십육 56 +오십칠 57 +오십팔 58 +오십구 59 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py index faf522133..d32af2604 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py @@ -25,7 +25,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): Fitite state transducer for classifying fractions e.g., fraction { denominator: "사" numerator: "삼" } -> 3/4 - fraction { mixed number: "일" denominator: "사" numerator: "삼" } -> 1 3/4 + fraction { integer_part: "일" denominator: "사" numerator: "삼" } -> 1 3/4 fraction { denominator: "루트삼" numerator: "일" } -> 1/√3 fraction { denominator: "일점육오" numerator: "오십" } -> 50/1.65 fraction { denominator: "이루트육" numerator: "삼" } -> 3/2√6 diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py new file mode 100644 index 000000000..5f2000d89 --- /dev/null +++ 
b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py @@ -0,0 +1,63 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space + +class TimeFst(GraphFst): + """ + Finite state transducer for classifying time + e.g. 열두시 삼십분 -> time { hours: "12" minutes: "30" } + e.g. 12분전 -> time { minutes: "12" suffix: "전" } + e.g. 새벽 두시 -> time { hours: "2" suffix: "새벽" } + e.g. 두시반 -> time { hours: "2" minutes: "30" } + e.g. 
오후 두시반 -> time { prefix: "오후" hours: "2" minutes: "30" } + """ + def __init__(self): + super().__init__(name="time", kind="classify") + + # 1-12 for hours + graph_hours = pynini.string_file(get_abs_path("data/time/time_hours.tsv")) + # 0-59 for minutes, seconds + graph_minutes = pynini.string_file(get_abs_path("data/time/time_minutes_seconds.tsv")) + # Special expression for 30 minute + graph_half = pynini.cross("반", "30") + + hour_component = (pynutil.insert("hours: \"") + (graph_hours + pynutil.delete("시")) + pynutil.insert("\"")) + + minute_component = (pynutil.insert("minutes: \"") + ((graph_minutes + pynutil.delete("분")) | graph_half) + pynutil.insert("\"")) + + second_component = (pynutil.insert("seconds: \"") + (graph_minutes + pynutil.delete("초")) + pynutil.insert("\"")) + + hour = pynini.closure(hour_component, 0, 1) + minute = pynini.closure(delete_space + minute_component, 0, 1) + second = pynini.closure(delete_space + second_component , 0, 1) + + graph_regular = hour + minute + second + + # 오전 = AM, 오후 = PM + prefix_words = pynini.accep("오전") | pynini.accep("오후") + prefix_tag = pynutil.insert("prefix: \"") + prefix_words + pynutil.insert("\"") + + # 전 = before, 후 = after + suffix_words = pynini.accep("전") | pynini.accep("후") + suffix_tag = pynutil.insert("suffix: \"") + suffix_words + pynutil.insert("\"") + + final_graph = pynini.closure(delete_space + prefix_tag, 0, 1) + graph_regular + pynini.closure(delete_space + suffix_tag, 0, 1) + + self.fst = self.add_tokens(final_graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 10bd1791f..98ce064f9 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -23,6 +23,7 @@ from 
nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst @@ -70,6 +71,9 @@ def __init__( fraction = FractionFst(cardinal, decimal) fraction_graph = fraction.fst + time = TimeFst() + time_graph = time.fst + word_graph = WordFst().fst classify = ( @@ -77,6 +81,7 @@ def __init__( | pynutil.add_weight(ordinal_graph, 1.1) | pynutil.add_weight(decimal_graph, 1.1) | pynutil.add_weight(fraction_graph, 1.0) + | pynutil.add_weight(time_graph, 1.0) | pynutil.add_weight(word_graph, 100) ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py new file mode 100644 index 000000000..11e174b82 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py @@ -0,0 +1,70 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_DIGIT, + GraphFst, + delete_space, + NEMO_NOT_QUOTE +) + + +class TimeFst(GraphFst): + """ + Finite state transducer for classifying time + e.g. 열두시 삼십분 -> time { hours: "12" minutes: "30" } + e.g. 12분전 -> time { minutes: "12" suffix: "전" } + e.g. 새벽 두시 -> time { hours: "2" suffix: "새벽" } + e.g. 두시반 -> time { hours: "2" minutes: "30" } + e.g. 오후 두시반 -> time { prefix: "오후" hours: "2" minutes: "30" } + """ + def __init__(self): + super().__init__(name="time", kind="verbalize") + + hours_component = pynutil.delete("hours: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + minutes_component = pynutil.delete("minutes: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + seconds_component = pynutil.delete("seconds: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + suffix_component = pynutil.delete("suffix: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + prefix_component = pynutil.delete("prefix: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + + # Add a leading zero to single-digit minutes/seconds + single_digit = NEMO_DIGIT + leading_zero = pynutil.insert("0") + single_digit + add_leading_zero = pynini.union(single_digit @ leading_zero, pynini.closure(NEMO_DIGIT, 2)) + + minutes = minutes_component @ add_leading_zero + seconds = seconds_component @ add_leading_zero + + # Defining all the possible combinations + path_h = hours_component + pynutil.insert(":00") + path_m = minutes + path_s = seconds + + path_hm = hours_component + delete_space + pynutil.insert(":") + minutes + path_hs = hours_component + delete_space + pynutil.insert(":") + pynutil.insert("00") + delete_space + pynutil.insert(":") + seconds + path_ms = minutes + delete_space + pynutil.insert(":") + seconds + + path_hms = hours_component + delete_space + pynutil.insert(":") + minutes + 
delete_space + pynutil.insert(":") + seconds + + time_graph = pynini.union(path_h, path_m, path_s, path_hm, path_hs, path_ms, path_hms) + + # Adding prefix and suffix space + optional_prefix_out = pynini.closure(delete_space + prefix_component, 0, 1) + optional_suffix_out = pynini.closure(delete_space + pynutil.insert(" ") + suffix_component, 0, 1) + + final_graph = optional_prefix_out + time_graph + optional_suffix_out + self.fst = self.delete_tokens(delete_space + final_graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 567cdc695..917b519a0 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -18,6 +18,7 @@ from nemo_text_processing.inverse_text_normalization.ko.verbalizers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst @@ -42,5 +43,8 @@ def __init__(self): fraction = FractionFst() fraction_graph = fraction.fst - graph = cardinal_graph | ordinal_graph | decimal_graph | fraction_graph + time = TimeFst() + time_graph = time.fst + + graph = cardinal_graph | ordinal_graph | decimal_graph | fraction_graph | time_graph self.fst = graph diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..fe0615dec --- /dev/null +++ 
b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt @@ -0,0 +1,19 @@ +두시~2:00 +열두시~12:00 +삼십분~30 +오초~05 +두시 삼십분~2:30 +세시 삼분~3:03 +두시 반~2:30 +열두시 반~12:30 +삼십분 오초~30:05 +삼분 오초~03:05 +두시 오초~2:00:05 +두시 삼십분 오초~2:30:05 +오전두시~오전2:00 +오후네시반~오후4:30 +두시전~2:00 전 +두시십분후~2:10 후 +한시 십오분 삼십초~1:15:30 +네시 이분~4:02 +열한시 오십구분~11:59 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh index a63c08f84..a08d792e7 100644 --- a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh @@ -42,6 +42,11 @@ testITNFraction() { runtest $input } +testITNTime() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_time.txt + runtest $input +} + # Remove all command-line arguments shift $# diff --git a/tests/nemo_text_processing/ko/test_time.py b/tests/nemo_text_processing/ko/test_time.py new file mode 100644 index 000000000..c5e0f71d3 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_time.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestOrdinal: + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected From b7852af3dd0dad02e4447ac887a19d1f745a8ff4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Aug 2025 10:07:25 +0000 Subject: [PATCH 22/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../ko/taggers/decimal.py | 10 +++- .../ko/taggers/fraction.py | 60 ++++++++++++++----- .../ko/taggers/time.py | 28 ++++++--- .../ko/taggers/tokenize_and_classify.py | 2 +- .../ko/verbalizers/fraction.py | 4 +- .../ko/verbalizers/time.py | 33 +++++++--- .../ko/verbalizers/verbalize.py | 4 +- 7 files changed, 104 insertions(+), 37 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py index dd6f05bb3..ecb92df1d 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py @@ -16,7 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, NEMO_SPACE +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst from 
nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path @@ -47,8 +47,12 @@ def __init__(self, cardinal: GraphFst): integer_part = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") fractional_part = pynutil.insert("fractional_part: \"") + decimal_part + pynutil.insert("\"") - graph_decimal_regular = integer_part + decimal_point + pynutil.insert(NEMO_SPACE) + fractional_part # Regular decimal like 1.5 - graph_deicimal_larger = get_quantity(graph_decimal_regular) # If decimal is used to express big numbers like 15000 -> "1.5만" + graph_decimal_regular = ( + integer_part + decimal_point + pynutil.insert(NEMO_SPACE) + fractional_part + ) # Regular decimal like 1.5 + graph_deicimal_larger = get_quantity( + graph_decimal_regular + ) # If decimal is used to express big numbers like 15000 -> "1.5만" self.decimal = graph_decimal_regular | graph_deicimal_larger self.just_decimal = cardinals + pynini.cross("점", ".") + decimal_part diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py index d0250d2e9..f7a11e046 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py @@ -16,7 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, NEMO_SPACE +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst class FractionFst(GraphFst): @@ -50,10 +50,18 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph_mixed_number = ( pynutil.insert("integer_part: \"") + ( - decimal | (decimal + connecting_word) | (root_word + decimal) | (cardinal + root_word + decimal) - | (root_word + decimal + connecting_word) | (cardinal + root_word + decimal + connecting_word) - | cardinal | (cardinal + connecting_word) | (root_word + 
cardinal) | (cardinal + root_word + cardinal) - | (root_word + cardinal + connecting_word) | (cardinal + root_word + cardinal + connecting_word) + decimal + | (decimal + connecting_word) + | (root_word + decimal) + | (cardinal + root_word + decimal) + | (root_word + decimal + connecting_word) + | (cardinal + root_word + decimal + connecting_word) + | cardinal + | (cardinal + connecting_word) + | (root_word + cardinal) + | (cardinal + root_word + cardinal) + | (root_word + cardinal + connecting_word) + | (cardinal + root_word + cardinal + connecting_word) ) + pynutil.insert("\"") ) @@ -61,8 +69,14 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph_denominator = ( pynutil.insert("denominator: \"") + ( - (decimal | (cardinal + root_word + decimal) | (root_word + decimal) - | cardinal | (cardinal + root_word + cardinal) | (root_word + cardinal)) + ( + decimal + | (cardinal + root_word + decimal) + | (root_word + decimal) + | cardinal + | (cardinal + root_word + cardinal) + | (root_word + cardinal) + ) + pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1) ) + pynutil.insert("\"") @@ -71,21 +85,39 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph_numerator = ( pynutil.insert("numerator: \"") + ( - (decimal | (cardinal + root_word + decimal) | (root_word + decimal) - | cardinal | (cardinal + root_word + cardinal) | (root_word + cardinal)) + ( + decimal + | (cardinal + root_word + decimal) + | (root_word + decimal) + | cardinal + | (cardinal + root_word + cardinal) + | (root_word + cardinal) + ) + pynini.closure(pynutil.delete(NEMO_SPACE)) ) + pynutil.insert("\"") ) - graph_fraction_sign = (graph_sign + pynutil.insert(NEMO_SPACE) + graph_denominator + pynutil.insert(NEMO_SPACE) + fraction_word + graph_numerator) - graph_fraction_no_sign = (graph_denominator + pynutil.insert(NEMO_SPACE) + fraction_word + graph_numerator) + graph_fraction_sign = ( + graph_sign + + pynutil.insert(NEMO_SPACE) + + graph_denominator + + pynutil.insert(NEMO_SPACE) 
+ + fraction_word + + graph_numerator + ) + graph_fraction_no_sign = graph_denominator + pynutil.insert(NEMO_SPACE) + fraction_word + graph_numerator # Only fraction like "1/3" or "- 1/3" - graph_fractions = (graph_fraction_sign | graph_fraction_no_sign) + graph_fractions = graph_fraction_sign | graph_fraction_no_sign # Mixed number fraction like "2 1/3" or "-2 1/3" graph_mixed_number_fraction = ( - pynini.closure((graph_sign + pynutil.insert(" ")), 0, 1) + pynutil.add_weight(graph_mixed_number, 1.1) - + pynutil.insert(NEMO_SPACE) + graph_denominator + pynutil.insert(NEMO_SPACE) + fraction_word + graph_numerator + pynini.closure((graph_sign + pynutil.insert(" ")), 0, 1) + + pynutil.add_weight(graph_mixed_number, 1.1) + + pynutil.insert(NEMO_SPACE) + + graph_denominator + + pynutil.insert(NEMO_SPACE) + + fraction_word + + graph_numerator ) final_graph = graph_fractions | graph_mixed_number_fraction diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py index 5f2000d89..b9ca9dbfe 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py @@ -16,8 +16,9 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + class TimeFst(GraphFst): """ @@ -28,6 +29,7 @@ class TimeFst(GraphFst): e.g. 두시반 -> time { hours: "2" minutes: "30" } e.g. 
오후 두시반 -> time { prefix: "오후" hours: "2" minutes: "30" } """ + def __init__(self): super().__init__(name="time", kind="classify") @@ -38,18 +40,24 @@ def __init__(self): # Special expression for 30 minute graph_half = pynini.cross("반", "30") - hour_component = (pynutil.insert("hours: \"") + (graph_hours + pynutil.delete("시")) + pynutil.insert("\"")) + hour_component = pynutil.insert("hours: \"") + (graph_hours + pynutil.delete("시")) + pynutil.insert("\"") - minute_component = (pynutil.insert("minutes: \"") + ((graph_minutes + pynutil.delete("분")) | graph_half) + pynutil.insert("\"")) + minute_component = ( + pynutil.insert("minutes: \"") + + ((graph_minutes + pynutil.delete("분")) | graph_half) + + pynutil.insert("\"") + ) - second_component = (pynutil.insert("seconds: \"") + (graph_minutes + pynutil.delete("초")) + pynutil.insert("\"")) + second_component = ( + pynutil.insert("seconds: \"") + (graph_minutes + pynutil.delete("초")) + pynutil.insert("\"") + ) hour = pynini.closure(hour_component, 0, 1) minute = pynini.closure(delete_space + minute_component, 0, 1) - second = pynini.closure(delete_space + second_component , 0, 1) + second = pynini.closure(delete_space + second_component, 0, 1) graph_regular = hour + minute + second - + # 오전 = AM, 오후 = PM prefix_words = pynini.accep("오전") | pynini.accep("오후") prefix_tag = pynutil.insert("prefix: \"") + prefix_words + pynutil.insert("\"") @@ -58,6 +66,10 @@ def __init__(self): suffix_words = pynini.accep("전") | pynini.accep("후") suffix_tag = pynutil.insert("suffix: \"") + suffix_words + pynutil.insert("\"") - final_graph = pynini.closure(delete_space + prefix_tag, 0, 1) + graph_regular + pynini.closure(delete_space + suffix_tag, 0, 1) + final_graph = ( + pynini.closure(delete_space + prefix_tag, 0, 1) + + graph_regular + + pynini.closure(delete_space + suffix_tag, 0, 1) + ) - self.fst = self.add_tokens(final_graph).optimize() \ No newline at end of file + self.fst = self.add_tokens(final_graph).optimize() diff --git 
a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 98ce064f9..5325be602 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -21,8 +21,8 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/fraction.py index 4e04f65e3..7b0845dc1 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/fraction.py @@ -53,7 +53,9 @@ def __init__(self): regular_graph = ( pynini.closure((sign_component + pynutil.delete(NEMO_SPACE)), 0, 1) - + pynini.closure(mixed_number_component + pynutil.delete(NEMO_SPACE) + pynutil.insert(NEMO_NON_BREAKING_SPACE)) + + pynini.closure( + mixed_number_component + pynutil.delete(NEMO_SPACE) + pynutil.insert(NEMO_NON_BREAKING_SPACE) + ) + numerator_component + pynutil.delete(NEMO_SPACE) + pynutil.insert("/") diff --git 
a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py index 11e174b82..a1d264caa 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py @@ -17,9 +17,9 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( NEMO_DIGIT, + NEMO_NOT_QUOTE, GraphFst, delete_space, - NEMO_NOT_QUOTE ) @@ -32,6 +32,7 @@ class TimeFst(GraphFst): e.g. 두시반 -> time { hours: "2" minutes: "30" } e.g. 오후 두시반 -> time { prefix: "오후" hours: "2" minutes: "30" } """ + def __init__(self): super().__init__(name="time", kind="verbalize") @@ -39,32 +40,48 @@ def __init__(self): minutes_component = pynutil.delete("minutes: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") seconds_component = pynutil.delete("seconds: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") suffix_component = pynutil.delete("suffix: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - prefix_component = pynutil.delete("prefix: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + prefix_component = pynutil.delete("prefix: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") # Add a leading zero to single-digit minutes/seconds single_digit = NEMO_DIGIT leading_zero = pynutil.insert("0") + single_digit add_leading_zero = pynini.union(single_digit @ leading_zero, pynini.closure(NEMO_DIGIT, 2)) - + minutes = minutes_component @ add_leading_zero seconds = seconds_component @ add_leading_zero - + # Defining all the possible combinations path_h = hours_component + pynutil.insert(":00") path_m = minutes path_s = seconds path_hm = hours_component + delete_space + pynutil.insert(":") + minutes - path_hs = hours_component + delete_space + pynutil.insert(":") + pynutil.insert("00") + delete_space + pynutil.insert(":") + seconds + path_hs = ( + hours_component + 
+ delete_space + + pynutil.insert(":") + + pynutil.insert("00") + + delete_space + + pynutil.insert(":") + + seconds + ) path_ms = minutes + delete_space + pynutil.insert(":") + seconds - path_hms = hours_component + delete_space + pynutil.insert(":") + minutes + delete_space + pynutil.insert(":") + seconds + path_hms = ( + hours_component + + delete_space + + pynutil.insert(":") + + minutes + + delete_space + + pynutil.insert(":") + + seconds + ) time_graph = pynini.union(path_h, path_m, path_s, path_hm, path_hs, path_ms, path_hms) # Adding prefix and suffix space optional_prefix_out = pynini.closure(delete_space + prefix_component, 0, 1) optional_suffix_out = pynini.closure(delete_space + pynutil.insert(" ") + suffix_component, 0, 1) - + final_graph = optional_prefix_out + time_graph + optional_suffix_out - self.fst = self.delete_tokens(delete_space + final_graph).optimize() \ No newline at end of file + self.fst = self.delete_tokens(delete_space + final_graph).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 917b519a0..f227bcf7c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -16,8 +16,8 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.decimal import DecimalFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst from 
nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst @@ -46,5 +46,5 @@ def __init__(self): time = TimeFst() time_graph = time.fst - graph = cardinal_graph | ordinal_graph | decimal_graph | fraction_graph | time_graph + graph = cardinal_graph | ordinal_graph | decimal_graph | fraction_graph | time_graph self.fst = graph From 736ca34b50160da2af9ed67d6216ab1cf5b5a690 Mon Sep 17 00:00:00 2001 From: Hyunmin Lee Date: Thu, 28 Aug 2025 16:46:28 -0700 Subject: [PATCH 23/29] Changes to time ITN and draft for date ITN Signed-off-by: Hyunmin Lee --- .../ko/data/months.tsv | 12 +++ .../ko/data/time/time_minutes_seconds.tsv | 60 -------------- .../ko/taggers/date.py | 78 +++++++++++++++++++ .../ko/taggers/time.py | 47 +++++++++-- .../ko/taggers/tokenize_and_classify.py | 5 ++ .../ko/verbalizers/date.py | 50 ++++++++++++ .../ko/verbalizers/verbalize.py | 8 +- .../test_cases_date.txt | 16 ++++ .../test_cases_time.txt | 4 - tests/nemo_text_processing/ko/test_date.py | 32 ++++++++ ..._sparrowhawk_inverse_text_normalization.sh | 5 ++ 11 files changed, 244 insertions(+), 73 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/months.tsv delete mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/time/time_minutes_seconds.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/date.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py create mode 100644 tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt create mode 100644 tests/nemo_text_processing/ko/test_date.py diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/months.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/months.tsv new file mode 100644 index 000000000..52039ef35 --- /dev/null +++ 
b/nemo_text_processing/inverse_text_normalization/ko/data/months.tsv @@ -0,0 +1,12 @@ +일 1 +이 2 +삼 3 +사 4 +오 5 +유 6 +칠 7 +팔 8 +구 9 +시 10 +십일 11 +십이 12 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/time/time_minutes_seconds.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/time/time_minutes_seconds.tsv deleted file mode 100644 index efd37f03f..000000000 --- a/nemo_text_processing/inverse_text_normalization/ko/data/time/time_minutes_seconds.tsv +++ /dev/null @@ -1,60 +0,0 @@ -영 0 -일 1 -이 2 -삼 3 -사 4 -오 5 -육 6 -칠 7 -팔 8 -구 9 -십 10 -십일 11 -십이 12 -십삼 13 -십사 14 -십오 15 -십육 16 -십칠 17 -십팔 18 -십구 19 -이십 20 -이십일 21 -이십이 22 -이십삼 23 -이십사 24 -이십오 25 -이십육 26 -이십칠 27 -이십팔 28 -이십구 29 -삼십 30 -삼십일 31 -삼십이 32 -삼십삼 33 -삼십사 34 -삼십오 35 -삼십육 36 -삼십칠 37 -삼십팔 38 -삼십구 39 -사십 40 -사십일 41 -사십이 42 -사십삼 43 -사십사 44 -사십오 45 -사십육 46 -사십칠 47 -사십팔 48 -사십구 49 -오십 50 -오십일 51 -오십이 52 -오십삼 53 -오십사 54 -오십오 55 -오십육 56 -오십칠 57 -오십팔 58 -오십구 59 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py new file mode 100644 index 000000000..7a58f518a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py @@ -0,0 +1,78 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, NEMO_SPACE +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + + +class DateFst(GraphFst): + """ + Finite state transducer for classifying date, + e.g. 이천십이년 일월 오일 -> date { year: "2012" month: "1" day: "5" } + e.g. 오월 -> date { month: "5" } + e.g. 칠일 -> date { day: "7" } + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="date", kind="classify") + + cardinal = cardinal.just_cardinals + month = pynini.string_file(get_abs_path("data/months.tsv")) + + spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) + + year_suffix = pynini.cross("년", "") + month_suffix = pynini.cross("월", "") + day_suffix = pynini.cross("일", "") + + year_component = ( + pynutil.insert("year: \"") + + cardinal + + pynini.closure(year_suffix, 0, 1) + + pynutil.insert("\"") + ) + + month_component = ( + pynutil.insert("month: \"") + + spacing + + month + + pynini.closure(month_suffix, 0, 1) + + pynutil.insert("\"") + ) + + day_component = ( + pynutil.insert("day: \"") + + spacing + + cardinal + + day_suffix + + spacing + + pynutil.insert("\"") + ) + + graph_component = year_component | month_component | day_component + graph_date = ( + pynini.closure(year_component, 0, 1) + + pynini.closure((pynutil.insert(NEMO_SPACE)) + month_component, 0, 1) + + pynini.closure((pynutil.insert(NEMO_SPACE)) + day_component, 0, 1) + ) + + final_graph = graph_component | graph_date + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() + diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py index b9ca9dbfe..63c85487f 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py @@ -16,7 +16,7 @@ import pynini 
from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, NEMO_SPACE from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path @@ -33,23 +33,56 @@ class TimeFst(GraphFst): def __init__(self): super().__init__(name="time", kind="classify") + # 1-9 in cardinals for minutes and seconds + cardinal_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + cardinal_zero = pynini.cross("영", "0") + + graph_tens_prefix = pynini.union( + pynini.cross("이", "2"), + pynini.cross("삼", "3"), + pynini.cross("사", "4"), + pynini.cross("오", "5") + ) + # Graphing 10-19 + graph_ten = pynini.union( + pynini.cross("십", "10"), + pynini.cross("십", "1") + cardinal_digit + ).optimize() + # Graphing 20-59 + graph_tens = ( + (graph_tens_prefix + pynini.cross("십", "0")) + | (graph_tens_prefix + pynini.cross("십", "") + cardinal_digit) + ) + + graph_0_to_59 = pynini.union( + cardinal_zero, + cardinal_digit, + graph_ten, + graph_tens + ).optimize() + # 1-12 for hours graph_hours = pynini.string_file(get_abs_path("data/time/time_hours.tsv")) - # 0-59 for minutes, seconds - graph_minutes = pynini.string_file(get_abs_path("data/time/time_minutes_seconds.tsv")) # Special expression for 30 minute graph_half = pynini.cross("반", "30") - hour_component = pynutil.insert("hours: \"") + (graph_hours + pynutil.delete("시")) + pynutil.insert("\"") + # Adding space if there are one + spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) + + hour_suffix = pynini.cross("시", "") + minute_suffix = pynini.cross("분", "") + second_suffix = pynini.cross("초", "") + + hour_component = pynutil.insert("hours: \"") + (graph_hours + spacing + hour_suffix) + pynutil.insert("\"") minute_component = ( pynutil.insert("minutes: \"") - + ((graph_minutes + pynutil.delete("분")) | graph_half) + + ((graph_0_to_59 + spacing + 
minute_suffix) | graph_half) + pynutil.insert("\"") ) second_component = ( - pynutil.insert("seconds: \"") + (graph_minutes + pynutil.delete("초")) + pynutil.insert("\"") + pynutil.insert("seconds: \"") + (graph_0_to_59 + spacing + second_suffix) + pynutil.insert("\"") ) hour = pynini.closure(hour_component, 0, 1) @@ -59,7 +92,7 @@ def __init__(self): graph_regular = hour + minute + second # 오전 = AM, 오후 = PM - prefix_words = pynini.accep("오전") | pynini.accep("오후") + prefix_words = (pynini.accep("오전") + spacing) | (pynini.accep("오후") + spacing) | (pynini.accep("새벽") + spacing) | (pynini.accep("아침") + spacing) prefix_tag = pynutil.insert("prefix: \"") + prefix_words + pynutil.insert("\"") # 전 = before, 후 = after diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 5325be602..3994b232f 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -24,6 +24,7 @@ from nemo_text_processing.inverse_text_normalization.ko.taggers.fraction import FractionFst from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst @@ -74,6 +75,9 @@ def __init__( time = TimeFst() time_graph = time.fst + date = DateFst(cardinal) + date_graph = date.fst + word_graph = WordFst().fst classify = ( @@ -82,6 +86,7 @@ def __init__( | pynutil.add_weight(decimal_graph, 1.1) | pynutil.add_weight(fraction_graph, 1.0) | pynutil.add_weight(time_graph, 1.0) + | pynutil.add_weight(date_graph, 1.1) | pynutil.add_weight(word_graph, 100) ) diff --git 
a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py new file mode 100644 index 000000000..83d3611f8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py @@ -0,0 +1,50 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, NEMO_SPACE + + +class DateFst(GraphFst): + """ + Finite state transducer for classifying date, + e.g. 이천십이년 일월 오일 -> date { year: "2012" month: "1" day: "5" } + e.g. 오월 -> date { month: "5" } + e.g. 
칠일 -> date { day: "7" } + """ + + def __init__(self): + super().__init__(name="date", kind="verbalize") + + year_component = ( + pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("년") + pynutil.delete("\"") + ) + month_component = ( + pynutil.delete("month: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("월") + pynutil.delete("\"") + ) + day_component = ( + pynutil.delete("day: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("일") + pynutil.delete("\"") + ) + + graph = ( + pynini.closure(pynutil.delete(NEMO_SPACE) + year_component, 0, 1) + + pynini.closure(pynutil.delete(NEMO_SPACE) + month_component, 0, 1) + + pynini.closure(pynutil.delete(NEMO_SPACE) + day_component, 0, 1) + ) + + final_graph = self.delete_tokens(graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index f227bcf7c..56a109bae 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -19,6 +19,7 @@ from nemo_text_processing.inverse_text_normalization.ko.verbalizers.fraction import FractionFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst @@ -46,5 +47,8 @@ def __init__(self): time = TimeFst() time_graph = time.fst - graph = cardinal_graph | ordinal_graph | decimal_graph | fraction_graph | time_graph - self.fst = graph + date = DateFst() + date_graph = date.fst + + graph = cardinal_graph | ordinal_graph | decimal_graph | fraction_graph | time_graph | date_graph + self.fst = graph \ No 
newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..ecad6dc19 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt @@ -0,0 +1,16 @@ +이천이십사년팔월이십팔일~2024년8월28일 +이천이십삼년 구월 오일~2023년 9월 5일 +천구백구십구년십이월삼십일일~1999년12월31일 +이천년 이월 이십구일~2000년 2월 29일 +이천십년시월십일~2010년10월10일 +이천이십일년유월십육일~2021년6월16일 +이천삼십년삼월십사일~2030년3월14일 +천구백팔십팔년 오월 이십일~1988년 5월 20일 +이천일년 칠월 구일~2001년 7월 9일 +이천십팔년사월삼십일~2018년4월30일 +삼천년팔월십오일~3000년8월15일 +이천구년 일월이십일~2009년 1월20일 +이천삼십오년~2035년 +오월~5월 +이십사일~24일 +구천구백구십구년삼월일일~9999년3월1일 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt index fe0615dec..450039132 100644 --- a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt @@ -1,13 +1,9 @@ 두시~2:00 열두시~12:00 -삼십분~30 -오초~05 두시 삼십분~2:30 세시 삼분~3:03 두시 반~2:30 열두시 반~12:30 -삼십분 오초~30:05 -삼분 오초~03:05 두시 오초~2:00:05 두시 삼십분 오초~2:30:05 오전두시~오전2:00 diff --git a/tests/nemo_text_processing/ko/test_date.py b/tests/nemo_text_processing/ko/test_date.py new file mode 100644 index 000000000..f26513a15 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_date.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestOrdinal: + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh index a08d792e7..854aeafe7 100644 --- a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh @@ -47,6 +47,11 @@ testITNTime() { runtest $input } +testITNDate() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_date.txt + runtest $input +} + # Remove all command-line arguments shift $# From 9f01195ba2e927e3280fb0752cf7e1b39ac471a1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Aug 2025 23:48:19 +0000 Subject: [PATCH 24/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, 
see https://pre-commit.ci --- .../ko/taggers/date.py | 23 +++---------- .../ko/taggers/time.py | 33 ++++++++----------- .../ko/taggers/tokenize_and_classify.py | 2 +- .../ko/verbalizers/date.py | 4 +-- .../ko/verbalizers/verbalize.py | 4 +-- 5 files changed, 22 insertions(+), 44 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py index 7a58f518a..b9de5c299 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py @@ -16,7 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, NEMO_SPACE +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path @@ -41,28 +41,14 @@ def __init__(self, cardinal: GraphFst): day_suffix = pynini.cross("일", "") year_component = ( - pynutil.insert("year: \"") - + cardinal - + pynini.closure(year_suffix, 0, 1) - + pynutil.insert("\"") + pynutil.insert("year: \"") + cardinal + pynini.closure(year_suffix, 0, 1) + pynutil.insert("\"") ) month_component = ( - pynutil.insert("month: \"") - + spacing - + month - + pynini.closure(month_suffix, 0, 1) - + pynutil.insert("\"") + pynutil.insert("month: \"") + spacing + month + pynini.closure(month_suffix, 0, 1) + pynutil.insert("\"") ) - day_component = ( - pynutil.insert("day: \"") - + spacing - + cardinal - + day_suffix - + spacing - + pynutil.insert("\"") - ) + day_component = pynutil.insert("day: \"") + spacing + cardinal + day_suffix + spacing + pynutil.insert("\"") graph_component = year_component | month_component | day_component graph_date = ( @@ -75,4 +61,3 @@ def __init__(self, cardinal: GraphFst): final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() - diff --git 
a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py index 63c85487f..d804f5999 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py @@ -16,7 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, NEMO_SPACE +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path @@ -36,30 +36,18 @@ def __init__(self): # 1-9 in cardinals for minutes and seconds cardinal_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) cardinal_zero = pynini.cross("영", "0") - + graph_tens_prefix = pynini.union( - pynini.cross("이", "2"), - pynini.cross("삼", "3"), - pynini.cross("사", "4"), - pynini.cross("오", "5") + pynini.cross("이", "2"), pynini.cross("삼", "3"), pynini.cross("사", "4"), pynini.cross("오", "5") ) # Graphing 10-19 - graph_ten = pynini.union( - pynini.cross("십", "10"), - pynini.cross("십", "1") + cardinal_digit - ).optimize() + graph_ten = pynini.union(pynini.cross("십", "10"), pynini.cross("십", "1") + cardinal_digit).optimize() # Graphing 20-59 - graph_tens = ( - (graph_tens_prefix + pynini.cross("십", "0")) - | (graph_tens_prefix + pynini.cross("십", "") + cardinal_digit) + graph_tens = (graph_tens_prefix + pynini.cross("십", "0")) | ( + graph_tens_prefix + pynini.cross("십", "") + cardinal_digit ) - graph_0_to_59 = pynini.union( - cardinal_zero, - cardinal_digit, - graph_ten, - graph_tens - ).optimize() + graph_0_to_59 = pynini.union(cardinal_zero, cardinal_digit, graph_ten, graph_tens).optimize() # 1-12 for hours graph_hours = pynini.string_file(get_abs_path("data/time/time_hours.tsv")) @@ -92,7 +80,12 @@ def __init__(self): graph_regular = hour + minute + 
second # 오전 = AM, 오후 = PM - prefix_words = (pynini.accep("오전") + spacing) | (pynini.accep("오후") + spacing) | (pynini.accep("새벽") + spacing) | (pynini.accep("아침") + spacing) + prefix_words = ( + (pynini.accep("오전") + spacing) + | (pynini.accep("오후") + spacing) + | (pynini.accep("새벽") + spacing) + | (pynini.accep("아침") + spacing) + ) prefix_tag = pynutil.insert("prefix: \"") + prefix_words + pynutil.insert("\"") # 전 = before, 후 = after diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 3994b232f..e57d43dd3 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -20,11 +20,11 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.fraction import FractionFst from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py index 83d3611f8..88ed973df 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py +++ 
b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py @@ -16,7 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, NEMO_SPACE +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst class DateFst(GraphFst): @@ -39,7 +39,7 @@ def __init__(self): day_component = ( pynutil.delete("day: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("일") + pynutil.delete("\"") ) - + graph = ( pynini.closure(pynutil.delete(NEMO_SPACE) + year_component, 0, 1) + pynini.closure(pynutil.delete(NEMO_SPACE) + month_component, 0, 1) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 56a109bae..51a3a8600 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -15,11 +15,11 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.fraction import FractionFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst @@ -51,4 +51,4 @@ def __init__(self): date_graph = date.fst graph = cardinal_graph | ordinal_graph 
| decimal_graph | fraction_graph | time_graph | date_graph - self.fst = graph \ No newline at end of file + self.fst = graph From 2d51a3904320e9cd4d7d64f1ccc00db0fc3ad054 Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Tue, 23 Sep 2025 05:00:52 -0700 Subject: [PATCH 25/29] Adding money to the Korean ITN Signed-off-by: hmlee245 --- .../ko/data/currency.tsv | 8 +++ .../ko/taggers/money.py | 59 +++++++++++++++++++ .../ko/taggers/tokenize_and_classify.py | 5 ++ .../ko/verbalizers/money.py | 49 +++++++++++++++ .../ko/verbalizers/time.py | 3 +- .../ko/verbalizers/verbalize.py | 6 +- .../test_cases_money.txt | 30 ++++++++++ tests/nemo_text_processing/ko/test_money.py | 32 ++++++++++ ..._sparrowhawk_inverse_text_normalization.sh | 5 ++ 9 files changed, 195 insertions(+), 2 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/money.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py create mode 100644 tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt create mode 100644 tests/nemo_text_processing/ko/test_money.py diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv new file mode 100644 index 000000000..516cf5c0a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv @@ -0,0 +1,8 @@ +달러 $ +유로 € +엔 ¥ +파운드 £ +위안 ¥ +페소 $ +루피 ₹ +원 ₩ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py new file mode 100644 index 000000000..49a8b216c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py @@ -0,0 +1,59 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_DIGIT, + GraphFst, + convert_space, + delete_extra_space +) +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + + +class MoneyFst(GraphFst): + """ + Finite state transducer for classifying money + e.g. 오만 삼천원 -> money { integer_part: "53000" currency: "원" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="money", kind="classify") + + cardinals = cardinal.just_cardinals + currency = pynini.string_file(get_abs_path("data/currency.tsv")) + + graph_unit = ( + pynutil.insert('currency: "') + + currency + + pynutil.insert('"') + ) + + # Main graph for integer money amounts + # Structure: + + + graph_integer = ( + pynutil.insert('integer_part: "') + + cardinals + + pynutil.insert('"') + + delete_extra_space # Handles optional spacing + + graph_unit + ) + + final_graph = self.add_tokens(graph_integer) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 3994b232f..7a3f2a01e 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ 
b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -25,6 +25,7 @@ from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst @@ -78,6 +79,9 @@ def __init__( date = DateFst(cardinal) date_graph = date.fst + money = MoneyFst(cardinal) + money_graph = money.fst + word_graph = WordFst().fst classify = ( @@ -87,6 +91,7 @@ def __init__( | pynutil.add_weight(fraction_graph, 1.0) | pynutil.add_weight(time_graph, 1.0) | pynutil.add_weight(date_graph, 1.1) + | pynutil.add_weight(money_graph, 1.1) | pynutil.add_weight(word_graph, 100) ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py new file mode 100644 index 000000000..eb8ce7257 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_CHAR, GraphFst, delete_space + + +class MoneyFst(GraphFst): + """ + Finite state transducer for classifying money + e.g. 오만 삼천원 -> money { integer_part: "53000" currency: "원" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self): + super().__init__(name="money", kind="verbalize") + integer = ( + pynutil.delete("integer_part:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_CHAR - " ", 1) + + pynutil.delete('"') + ) + + unit = ( + pynutil.delete("currency:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_CHAR - " ", 1) + + pynutil.delete('"') + ) + graph = unit + delete_space + integer + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py index a1d264caa..e1e3755b1 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py @@ -20,6 +20,7 @@ NEMO_NOT_QUOTE, GraphFst, delete_space, + NEMO_SPACE ) @@ -81,7 +82,7 @@ def __init__(self): # Adding prefix and suffix space optional_prefix_out = pynini.closure(delete_space + prefix_component, 0, 1) - optional_suffix_out = pynini.closure(delete_space + pynutil.insert(" ") + suffix_component, 0, 1) + optional_suffix_out = pynini.closure(delete_space + pynutil.insert(NEMO_SPACE) + suffix_component, 0, 1) final_graph = optional_prefix_out + time_graph + optional_suffix_out self.fst = self.delete_tokens(delete_space + final_graph).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 56a109bae..01fdd3e1f 100644 --- 
a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -20,6 +20,7 @@ from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.date import DateFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst @@ -50,5 +51,8 @@ def __init__(self): date = DateFst() date_graph = date.fst - graph = cardinal_graph | ordinal_graph | decimal_graph | fraction_graph | time_graph | date_graph + money = MoneyFst() + money_graph = money.fst + + graph = cardinal_graph | ordinal_graph | decimal_graph | fraction_graph | time_graph | date_graph | money_graph self.fst = graph \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..14cdea536 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt @@ -0,0 +1,30 @@ +오천 원~₩5,000 +만오천원~₩15,000 +십이만 삼천 원~₩123,000 +백 원~₩100 +천백십일 원~₩1,111 +육십만 원~₩600,000 +백만 원~₩1,000,000 +삼백오십만 원~₩3,500,000 +천이백만 원~₩12,000,000 +일억 원~₩100,000,000 +십이억 오천만 원~₩1,250,000,000 +백억 원~₩10,000,000,000 +오천억~₩500,000,000,000 +일조 원~₩1,000,000,000,000 +삼조 오천억 원~₩3,500,000,000,000 +영원~₩0 +구십구 원~₩99 +이공이오 원~₩2,025 +만 원~₩10,000 +일만 원~₩10,000 +십오 달러~$15 +이십불~$20 +천오백 불~$1,500 +백만 달러~$1,000,000 +오십 유로~€50 +천 엔~¥1,000 +만 엔~¥10,000 +백 파운드~£100 +이십 위안~¥20 +구천구백구십구원~₩9,999 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_money.py b/tests/nemo_text_processing/ko/test_money.py new file mode 100644 index 
000000000..881a1ee4e --- /dev/null +++ b/tests/nemo_text_processing/ko/test_money.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestOrdinal: + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_money.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh index 854aeafe7..2843a88c0 100644 --- a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh @@ -52,6 +52,11 @@ testITNDate() { runtest $input } +testITNMoney() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_money.txt + runtest $input +} + # Remove all command-line 
arguments shift $# From 19dbc2455910972174f6aa9ee7e08cdf435b0666 Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Tue, 23 Sep 2025 05:09:04 -0700 Subject: [PATCH 26/29] Adding money to the Korean ITN Signed-off-by: hmlee245 --- .../inverse_text_normalization/ko/taggers/money.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py index 49a8b216c..f02049e33 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py @@ -19,7 +19,8 @@ NEMO_DIGIT, GraphFst, convert_space, - delete_extra_space + delete_extra_space, + NEMO_SPACE ) from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path From e5252d1f9631d1628eda5327407268e0dc61bcba Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Sep 2025 19:59:22 +0000 Subject: [PATCH 27/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/ko/taggers/money.py | 8 ++------ .../ko/taggers/tokenize_and_classify.py | 3 +-- .../inverse_text_normalization/ko/verbalizers/money.py | 4 ++-- .../inverse_text_normalization/ko/verbalizers/time.py | 2 +- .../ko/verbalizers/verbalize.py | 3 +-- 5 files changed, 7 insertions(+), 13 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py index f02049e33..d4dcdc3aa 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py @@ -17,10 +17,10 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( NEMO_DIGIT, + NEMO_SPACE, GraphFst, convert_space, 
delete_extra_space, - NEMO_SPACE ) from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path @@ -40,11 +40,7 @@ def __init__(self, cardinal: GraphFst): cardinals = cardinal.just_cardinals currency = pynini.string_file(get_abs_path("data/currency.tsv")) - graph_unit = ( - pynutil.insert('currency: "') - + currency - + pynutil.insert('"') - ) + graph_unit = pynutil.insert('currency: "') + currency + pynutil.insert('"') # Main graph for integer money amounts # Structure: + + diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index e54d8e3a8..2dc4661dd 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -23,10 +23,9 @@ from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py index eb8ce7257..2a09c6bb9 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py +++ 
b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py @@ -36,7 +36,7 @@ def __init__(self): + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete('"') ) - + unit = ( pynutil.delete("currency:") + delete_space @@ -46,4 +46,4 @@ def __init__(self): ) graph = unit + delete_space + integer delete_tokens = self.delete_tokens(graph) - self.fst = delete_tokens.optimize() \ No newline at end of file + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py index e1e3755b1..4b63ade99 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py @@ -18,9 +18,9 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( NEMO_DIGIT, NEMO_NOT_QUOTE, + NEMO_SPACE, GraphFst, delete_space, - NEMO_SPACE ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 1fb3b3133..ce5ad8cd1 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -18,10 +18,9 @@ from nemo_text_processing.inverse_text_normalization.ko.verbalizers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.date import DateFst 
-from nemo_text_processing.inverse_text_normalization.ko.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst From 46ee84891676133759aba0cad73d547b8db4bab2 Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Mon, 13 Oct 2025 20:16:26 -0700 Subject: [PATCH 28/29] Addition of telephone class, fixing time, money, date Signed-off-by: hmlee245 --- .../ko/data/currency.tsv | 1 + .../ko/data/time/time_hours.tsv | 13 ++++- .../ko/taggers/money.py | 31 +++++----- .../ko/taggers/telephone.py | 57 +++++++++++++++++++ .../ko/taggers/time.py | 26 ++++++--- .../ko/taggers/tokenize_and_classify.py | 5 ++ .../ko/verbalizers/money.py | 11 ++-- .../ko/verbalizers/telephone.py | 34 +++++++++++ .../ko/verbalizers/verbalize.py | 22 ++++++- .../test_cases_money.txt | 43 +++++++------- .../test_cases_telephone.txt | 11 ++++ ..._sparrowhawk_inverse_text_normalization.sh | 5 ++ .../nemo_text_processing/ko/test_telephone.py | 32 +++++++++++ 13 files changed, 239 insertions(+), 52 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/telephone.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/telephone.py create mode 100644 tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_telephone.txt create mode 100644 tests/nemo_text_processing/ko/test_telephone.py diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv index 516cf5c0a..fd2127530 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv +++ b/nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv @@ -1,4 +1,5 @@ 달러 $ +불 $ 유로 € 엔 ¥ 파운드 £ diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv index 24b980aa1..8044e4006 100644 --- 
a/nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv +++ b/nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv @@ -9,4 +9,15 @@ 아홉 9 열 10 열한 11 -열두 12 \ No newline at end of file +열두 12 +열세 13 +열네 14 +열다섯 15 +열여섯 16 +열일곱 17 +열여덟 18 +열아홉 19 +스무 20 +스물한 21 +스물두 22 +스물세 23 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py index f02049e33..d150b8e7f 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py @@ -16,10 +16,7 @@ from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - NEMO_DIGIT, GraphFst, - convert_space, - delete_extra_space, NEMO_SPACE ) from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path @@ -28,7 +25,7 @@ class MoneyFst(GraphFst): """ Finite state transducer for classifying money - e.g. 오만 삼천원 -> money { integer_part: "53000" currency: "원" } + e.g. 
오만 삼천원 -> money { integer_part: "53000" currency: "₩" } Args: cardinal: CardinalFst @@ -40,21 +37,25 @@ def __init__(self, cardinal: GraphFst): cardinals = cardinal.just_cardinals currency = pynini.string_file(get_abs_path("data/currency.tsv")) + # Accepting space if there are one between integer and currency + spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) + + graph_integer = ( + pynutil.insert("integer_part: \"") + + cardinals + + pynutil.insert("\"") + + spacing + ) + graph_unit = ( - pynutil.insert('currency: "') + pynutil.insert(" currency: \"") + currency - + pynutil.insert('"') + + pynutil.insert("\"") ) - # Main graph for integer money amounts - # Structure: + + - graph_integer = ( - pynutil.insert('integer_part: "') - + cardinals - + pynutil.insert('"') - + delete_extra_space # Handles optional spacing - + graph_unit + graph_final = ( + graph_integer + graph_unit ) - final_graph = self.add_tokens(graph_integer) + final_graph = self.add_tokens(graph_final) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/telephone.py new file mode 100644 index 000000000..b270def3d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/telephone.py @@ -0,0 +1,57 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_extra_space, NEMO_SPACE +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for classifying a generic 3-4-4 telephone number. + e.g. 공일공에 일이삼사에 오육칠팔 -> telephone { number_part: "010-1234-5678" } + + """ + + def __init__(self): + super().__init__(name="telephone", kind="classify") + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + graph_zero_alt = pynini.cross("공", "0") + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + + digit = graph_digit | graph_zero | graph_zero_alt + + separator = pynini.cross(pynini.union(" ", "에"), "-").optimize() + + digit_block_3 = digit + digit + digit + digit_block_4 = digit_block_3 + digit + + optional_separator = pynini.closure(separator, 0, 1) + spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) + + phone_number_graph = ( + pynutil.insert('number_part: "') + + digit_block_3 + + optional_separator + + digit_block_4 + + optional_separator + + digit_block_4 + + pynutil.insert('"') + ) + + graph = phone_number_graph + final_graph = self.add_tokens(graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py index d804f5999..96531bde8 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py @@ -16,6 +16,7 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.ko.utils import 
get_abs_path @@ -65,7 +66,7 @@ def __init__(self): minute_component = ( pynutil.insert("minutes: \"") - + ((graph_0_to_59 + spacing + minute_suffix) | graph_half) + + pynini.union((graph_0_to_59 + spacing + minute_suffix) | graph_half) + pynutil.insert("\"") ) @@ -80,11 +81,11 @@ def __init__(self): graph_regular = hour + minute + second # 오전 = AM, 오후 = PM - prefix_words = ( - (pynini.accep("오전") + spacing) - | (pynini.accep("오후") + spacing) - | (pynini.accep("새벽") + spacing) - | (pynini.accep("아침") + spacing) + prefix_words = pynini.union( + (pynini.accep("오전") + spacing), + (pynini.accep("오후") + spacing), + (pynini.accep("새벽") + spacing), + (pynini.accep("아침") + spacing) ) prefix_tag = pynutil.insert("prefix: \"") + prefix_words + pynutil.insert("\"") @@ -92,10 +93,19 @@ def __init__(self): suffix_words = pynini.accep("전") | pynini.accep("후") suffix_tag = pynutil.insert("suffix: \"") + suffix_words + pynutil.insert("\"") - final_graph = ( + time_graph = ( pynini.closure(delete_space + prefix_tag, 0, 1) + graph_regular + pynini.closure(delete_space + suffix_tag, 0, 1) ) - self.fst = self.add_tokens(final_graph).optimize() + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + + #Adding cardinal graph to prevent processing out of range numbers + final_graph = pynini.union( + time_graph, + cardinal_graph + ) + + self.fst = self.add_tokens(final_graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index e54d8e3a8..f4f46805d 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -27,6 +27,7 @@ from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.taggers.date import 
DateFst from nemo_text_processing.inverse_text_normalization.ko.taggers.money import MoneyFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst @@ -83,6 +84,9 @@ def __init__( money = MoneyFst(cardinal) money_graph = money.fst + telephone = TelephoneFst() + telephone_graph = telephone.fst + word_graph = WordFst().fst classify = ( @@ -93,6 +97,7 @@ def __init__( | pynutil.add_weight(time_graph, 1.0) | pynutil.add_weight(date_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) + | pynutil.add_weight(telephone_graph, 1.1) | pynutil.add_weight(word_graph, 100) ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py index eb8ce7257..4a569c3d9 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_CHAR, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_CHAR, GraphFst, delete_space, NEMO_SPACE class MoneyFst(GraphFst): @@ -33,7 +33,7 @@ def __init__(self): pynutil.delete("integer_part:") + delete_space + pynutil.delete('"') - + pynini.closure(NEMO_CHAR - " ", 1) + + pynini.closure(NEMO_CHAR - NEMO_SPACE, 1) + pynutil.delete('"') ) @@ -41,9 +41,12 @@ def __init__(self): pynutil.delete("currency:") + delete_space + pynutil.delete('"') - + pynini.closure(NEMO_CHAR - " ", 1) + + pynini.closure(NEMO_CHAR - NEMO_SPACE, 1) + pynutil.delete('"') ) - graph = unit + delete_space + integer + + optional_space = pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1).optimize() + + graph = unit + optional_space + integer delete_tokens = self.delete_tokens(graph) 
self.fst = delete_tokens.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/telephone.py new file mode 100644 index 000000000..3ac213b59 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/telephone.py @@ -0,0 +1,34 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for classifying a generic 3-4-4 telephone number. + e.g. 
공일공에 일이삼사에 오육칠팔 -> telephone { number: "010-1234-5678" } + + """ + + def __init__(self): + super().__init__(name="telephone", kind="verbalize") + + number_part = pynutil.delete('number_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + delete_tokens = self.delete_tokens(number_part) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 1fb3b3133..d966fcf9e 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -12,6 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import pynini +from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst @@ -22,6 +24,7 @@ from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.money import MoneyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst @@ -55,5 +58,20 @@ def __init__(self): money = MoneyFst() money_graph = money.fst - graph = cardinal_graph | ordinal_graph | decimal_graph | fraction_graph | time_graph | date_graph | money_graph - self.fst = graph + telephone = TelephoneFst() + telephone_graph = telephone.fst + + word = WordFst() + word_graph = word.fst + + graph = pynini.union(cardinal_graph, + ordinal_graph, + decimal_graph, + 
fraction_graph, + time_graph, + date_graph, + money_graph, + telephone_graph, + word_graph + ) + self.fst = graph \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt index 14cdea536..09c6b2841 100644 --- a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt @@ -1,30 +1,29 @@ -오천 원~₩5,000 -만오천원~₩15,000 -십이만 삼천 원~₩123,000 +오천 원~₩5000 +만오천원~₩15000 +십이만삼천 원~₩123000 백 원~₩100 -천백십일 원~₩1,111 -육십만 원~₩600,000 -백만 원~₩1,000,000 -삼백오십만 원~₩3,500,000 -천이백만 원~₩12,000,000 -일억 원~₩100,000,000 -십이억 오천만 원~₩1,250,000,000 -백억 원~₩10,000,000,000 -오천억~₩500,000,000,000 -일조 원~₩1,000,000,000,000 -삼조 오천억 원~₩3,500,000,000,000 +천백십일 원~₩1111 +육십만 원~₩600000 +백만 원~₩1000000 +삼백오십만 원~₩3500000 +천이백만 원~₩12000000 +일억 원~₩100000000 +십이억오천만 원~₩1250000000 +백억 원~₩10000000000 +오천억원~₩500000000000 +일조 원~₩1000000000000 +삼조오천억 원~₩3500000000000 영원~₩0 구십구 원~₩99 -이공이오 원~₩2,025 -만 원~₩10,000 -일만 원~₩10,000 +만 원~₩10000 +일만 원~₩10000 십오 달러~$15 이십불~$20 -천오백 불~$1,500 -백만 달러~$1,000,000 +천오백 불~$1500 +백만 달러~$1000000 오십 유로~€50 -천 엔~¥1,000 -만 엔~¥10,000 +천 엔~¥1000 +만 엔~¥10000 백 파운드~£100 이십 위안~¥20 -구천구백구십구원~₩9,999 \ No newline at end of file +구천구백구십구원~₩9999 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..6702d4ddd --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_telephone.txt @@ -0,0 +1,11 @@ +공일공에 일이삼사에 오육칠팔~010-1234-5678 +영일영 구팔칠육 오사삼이~010-9876-5432 +공이에 삼사오육에 칠팔구공~02-3456-7890 +공삼일에 구팔칠에 육오사삼~031-987-6543 +천오백팔십팔에 이공공공~1588-2000 +천오백칠십칠에 구천번~1577-9000 +일일구~119 +일일이~112 +공일공 일이삼사 오육칠팔~010-1234-5678 +공이 삼사오에 
육칠팔구~02-345-6789 +공일공일이삼사오육칠팔~010-1234-5678 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh index 2843a88c0..816678de3 100644 --- a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh @@ -57,6 +57,11 @@ testITNMoney() { runtest $input } +testITNTelephone() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_telephone.txt + runtest $input +} + # Remove all command-line arguments shift $# diff --git a/tests/nemo_text_processing/ko/test_telephone.py b/tests/nemo_text_processing/ko/test_telephone.py new file mode 100644 index 000000000..4a3684896 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_telephone.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTelephone: + inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected From 8e3100461de7faba807487b0ff02ff9f271bbd1e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 14 Oct 2025 03:26:08 +0000 Subject: [PATCH 29/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../ko/taggers/money.py | 26 +++++------------- .../ko/taggers/telephone.py | 10 +++---- .../ko/taggers/time.py | 13 ++++----- .../ko/taggers/tokenize_and_classify.py | 4 +-- .../ko/verbalizers/money.py | 7 ++++- .../ko/verbalizers/verbalize.py | 27 +++++++++---------- 6 files changed, 36 insertions(+), 51 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py index d150b8e7f..f890e477e 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py @@ -15,10 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - GraphFst, - NEMO_SPACE -) +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst from
nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path @@ -40,22 +37,11 @@ def __init__(self, cardinal: GraphFst): # Accepting space if there are one between integer and currency spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) - graph_integer = ( - pynutil.insert("integer_part: \"") - + cardinals - + pynutil.insert("\"") - + spacing - ) - - graph_unit = ( - pynutil.insert(" currency: \"") - + currency - + pynutil.insert("\"") - ) - - graph_final = ( - graph_integer + graph_unit - ) + graph_integer = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") + spacing + + graph_unit = pynutil.insert(" currency: \"") + currency + pynutil.insert("\"") + + graph_final = graph_integer + graph_unit final_graph = self.add_tokens(graph_final) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/telephone.py index b270def3d..fe499a838 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/telephone.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_extra_space, NEMO_SPACE +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_extra_space from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path @@ -37,16 +37,16 @@ def __init__(self): separator = pynini.cross(pynini.union(" ", "에"), "-").optimize() digit_block_3 = digit + digit + digit - digit_block_4 = digit_block_3 + digit + digit_block_4 = digit_block_3 + digit optional_separator = pynini.closure(separator, 0, 1) spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) phone_number_graph = ( - pynutil.insert('number_part: "') + - digit_block_3 + pynutil.insert('number_part: "') + + 
digit_block_3 + optional_separator - + digit_block_4 + + digit_block_4 + optional_separator + digit_block_4 + pynutil.insert('"') diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py index 96531bde8..923a78c1c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py @@ -16,8 +16,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path @@ -85,7 +85,7 @@ def __init__(self): (pynini.accep("오전") + spacing), (pynini.accep("오후") + spacing), (pynini.accep("새벽") + spacing), - (pynini.accep("아침") + spacing) + (pynini.accep("아침") + spacing), ) prefix_tag = pynutil.insert("prefix: \"") + prefix_words + pynutil.insert("\"") @@ -102,10 +102,7 @@ def __init__(self): cardinal = CardinalFst() cardinal_graph = cardinal.fst - #Adding cardinal graph to prevent processing out of range numbers - final_graph = pynini.union( - time_graph, - cardinal_graph - ) + # Adding cardinal graph to prevent processing out of range numbers + final_graph = pynini.union(time_graph, cardinal_graph) - self.fst = self.add_tokens(final_graph).optimize() \ No newline at end of file + self.fst = self.add_tokens(final_graph).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 2999504c6..3f5943b15 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ 
b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -25,10 +25,8 @@ from nemo_text_processing.inverse_text_normalization.ko.taggers.fraction import FractionFst from nemo_text_processing.inverse_text_normalization.ko.taggers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.ko.taggers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py index 49ec64b7d..45e4c7e2c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py @@ -15,7 +15,12 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_CHAR, GraphFst, delete_space, NEMO_SPACE +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_CHAR, + NEMO_SPACE, + GraphFst, + delete_space, +) class MoneyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index acefdb40f..59d1c9809 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -22,10 +22,8 @@ from 
nemo_text_processing.inverse_text_normalization.ko.verbalizers.fraction import FractionFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.date import DateFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst @@ -65,14 +63,15 @@ def __init__(self): word = WordFst() word_graph = word.fst - graph = pynini.union(cardinal_graph, - ordinal_graph, - decimal_graph, - fraction_graph, - time_graph, - date_graph, - money_graph, - telephone_graph, - word_graph - ) - self.fst = graph \ No newline at end of file + graph = pynini.union( + cardinal_graph, + ordinal_graph, + decimal_graph, + fraction_graph, + time_graph, + date_graph, + money_graph, + telephone_graph, + word_graph, + ) + self.fst = graph