diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv new file mode 100644 index 000000000..fd2127530 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/currency.tsv @@ -0,0 +1,9 @@ +달러 $ +불 $ +유로 € +엔 ¥ +파운드 £ +위안 ¥ +페소 $ +루피 ₹ +원 ₩ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/months.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/months.tsv new file mode 100644 index 000000000..52039ef35 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/months.tsv @@ -0,0 +1,12 @@ +일 1 +이 2 +삼 3 +사 4 +오 5 +유 6 +칠 7 +팔 8 +구 9 +시 10 +십일 11 +십이 12 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv new file mode 100644 index 000000000..8044e4006 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/time/time_hours.tsv @@ -0,0 +1,23 @@ +한 1 +두 2 +세 3 +네 4 +다섯 5 +여섯 6 +일곱 7 +여덟 8 +아홉 9 +열 10 +열한 11 +열두 12 +열세 13 +열네 14 +열다섯 15 +열여섯 16 +열일곱 17 +열여덟 18 +열아홉 19 +스무 20 +스물한 21 +스물두 22 +스물세 23 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py new file mode 100644 index 000000000..b9de5c299 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py @@ -0,0 +1,63 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
+from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path
+
+
+class DateFst(GraphFst):
+    """
+    Finite state transducer for classifying date,
+        e.g. 이천십이년 일월 오일 -> date { year: "2012" month: "1" day: "5" }
+        e.g. 오월 -> date { month: "5" }
+        e.g. 칠일 -> date { day: "7" }
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="date", kind="classify")
+
+        cardinal = cardinal.just_cardinals
+        month = pynini.string_file(get_abs_path("data/months.tsv"))
+
+        spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1)  # at most one space, kept in the output
+
+        year_suffix = pynini.cross("년", "")  # unit markers are dropped here; the date verbalizer re-inserts them
+        month_suffix = pynini.cross("월", "")
+        day_suffix = pynini.cross("일", "")
+
+        year_component = (
+            pynutil.insert("year: \"") + cardinal + pynini.closure(year_suffix, 0, 1) + pynutil.insert("\"")
+        )  # NOTE(review): 년 is optional here, so a bare cardinal also matches as a year - confirm intended
+
+        month_component = (
+            pynutil.insert("month: \"") + spacing + month + pynini.closure(month_suffix, 0, 1) + pynutil.insert("\"")
+        )
+
+        day_component = pynutil.insert("day: \"") + spacing + cardinal + day_suffix + spacing + pynutil.insert("\"")
+
+        graph_component = year_component | month_component | day_component  # a single field on its own
+        graph_date = (
+            pynini.closure(year_component, 0, 1)
+            + pynini.closure((pynutil.insert(NEMO_SPACE)) + month_component, 0, 1)
+            + pynini.closure((pynutil.insert(NEMO_SPACE)) + day_component, 0, 1)
+        )  # optional year, month, day in that order; NOTE(review): all three optional, so "" matches too
+
+        final_graph = graph_component | graph_date
+
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py index 181f3ca8f..f7a11e046 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/fraction.py @@ -25,7 +25,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): Fitite state transducer for classifying fractions e.g., fraction { denominator: "사" numerator: "삼" } -> 3/4 - fraction { mixed number: "일" denominator: "사" numerator: "삼" } -> 1 3/4 + fraction { integer_part: "일" denominator: "사" numerator: "삼" } -> 1 3/4 fraction { denominator: "루트삼" numerator: "일" } -> 1/√3 fraction { denominator: "일점육오" numerator: "오십" } -> 50/1.65 fraction { denominator: "이루트육" numerator: "삼" } -> 3/2√6
diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py
new file mode 100644
index 000000000..f890e477e
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
+from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path
+
+
+class MoneyFst(GraphFst):
+    """
+    Finite state transducer for classifying money
+        e.g. 오만 삼천원 -> money { integer_part: "53000" currency: "₩" }
+
+    Args:
+        cardinal: CardinalFst
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="money", kind="classify")
+
+        cardinals = cardinal.just_cardinals
+        currency = pynini.string_file(get_abs_path("data/currency.tsv"))
+
+        # Accept at most one space between the integer and the currency word
+        spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1)
+
+        graph_integer = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") + spacing
+
+        graph_unit = pynutil.insert(" currency: \"") + currency + pynutil.insert("\"")
+
+        graph_final = graph_integer + graph_unit
+
+        final_graph = self.add_tokens(graph_final)
+        self.fst = final_graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/telephone.py
new file mode 100644
index 000000000..fe499a838
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/telephone.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_extra_space
+from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path
+
+
+class TelephoneFst(GraphFst):
+    """
+    Finite state transducer for classifying a generic 3-4-4 telephone number.
+        e.g. 공일공에 일이삼사에 오육칠팔 -> telephone { number_part: "010-1234-5678" }
+    Digit blocks may be joined by "에", a single space, "에" followed by a space, or nothing.
+    """
+
+    def __init__(self):
+        super().__init__(name="telephone", kind="classify")
+        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
+        graph_zero_alt = pynini.cross("공", "0")
+        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
+
+        digit = graph_digit | graph_zero | graph_zero_alt
+        # "에 " must be listed explicitly: the separator is applied at most once between two blocks
+        separator = pynini.cross(pynini.union("에 ", "에", " "), "-").optimize()
+
+        digit_block_3 = digit + digit + digit
+        digit_block_4 = digit_block_3 + digit
+
+        optional_separator = pynini.closure(separator, 0, 1)
+        spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1)
+
+        phone_number_graph = (
+            pynutil.insert('number_part: "')
+            + digit_block_3
+            + optional_separator
+            + digit_block_4
+            + optional_separator
+            + digit_block_4
+            + pynutil.insert('"')
+        )
+
+        graph = phone_number_graph
+        final_graph = self.add_tokens(graph)
+        self.fst = final_graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py
new file mode 100644
index 000000000..923a78c1c
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space
+from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst
+from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path
+
+
+class TimeFst(GraphFst):
+    """
+    Finite state transducer for classifying time
+        e.g. 열두시 삼십분 -> time { hours: "12" minutes: "30" }
+        e.g. 12분전 -> time { minutes: "12" suffix: "전" }
+        e.g. 새벽 두시 -> time { prefix: "새벽" hours: "2" }
+        e.g. 두시반 -> time { hours: "2" minutes: "30" }
+        e.g. 오후 두시반 -> time { prefix: "오후" hours: "2" minutes: "30" }
+    """
+
+    def __init__(self):
+        super().__init__(name="time", kind="classify")
+
+        # 1-9 in cardinals for minutes and seconds
+        cardinal_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
+        cardinal_zero = pynini.cross("영", "0")
+
+        graph_tens_prefix = pynini.union(
+            pynini.cross("이", "2"), pynini.cross("삼", "3"), pynini.cross("사", "4"), pynini.cross("오", "5")
+        )
+        # Graphing 10-19
+        graph_ten = pynini.union(pynini.cross("십", "10"), pynini.cross("십", "1") + cardinal_digit).optimize()
+        # Graphing 20-59
+        graph_tens = (graph_tens_prefix + pynini.cross("십", "0")) | (
+            graph_tens_prefix + pynini.cross("십", "") + cardinal_digit
+        )
+
+        graph_0_to_59 = pynini.union(cardinal_zero, cardinal_digit, graph_ten, graph_tens).optimize()
+
+        # 1-23 for hours (native Korean numerals listed in time_hours.tsv)
+        graph_hours = pynini.string_file(get_abs_path("data/time/time_hours.tsv"))
+        # Special expression for 30 minutes (반 = half)
+        graph_half = pynini.cross("반", "30")
+
+        # Accept at most one space between a number and its unit word
+        spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1)
+
+        hour_suffix = pynini.cross("시", "")
+        minute_suffix = pynini.cross("분", "")
+        second_suffix = pynini.cross("초", "")
+
+        hour_component = pynutil.insert("hours: \"") + (graph_hours + spacing + hour_suffix) + pynutil.insert("\"")
+
+        minute_component = (
+            pynutil.insert("minutes: \"")
+            + pynini.union((graph_0_to_59 + spacing + minute_suffix) | graph_half)
+            + pynutil.insert("\"")
+        )
+
+        second_component = (
+            pynutil.insert("seconds: \"") + (graph_0_to_59 + spacing + second_suffix) + pynutil.insert("\"")
+        )
+
+        hour = pynini.closure(hour_component, 0, 1)
+        minute = pynini.closure(delete_space + minute_component, 0, 1)
+        second = pynini.closure(delete_space + second_component, 0, 1)
+
+        graph_regular = hour + minute + second
+
+        # 오전 = AM, 오후 = PM; a space following the word, if any, is kept inside the prefix value
+        prefix_words = pynini.union(
+            (pynini.accep("오전") + spacing),
+            (pynini.accep("오후") + spacing),
+            (pynini.accep("새벽") + spacing),
+            (pynini.accep("아침") + spacing),
+        )
+        prefix_tag = pynutil.insert("prefix: \"") + prefix_words + pynutil.insert("\"")
+
+        # 전 = before, 후 = after
+        suffix_words = pynini.accep("전") | pynini.accep("후")
+        suffix_tag = pynutil.insert("suffix: \"") + suffix_words + pynutil.insert("\"")
+
+        time_graph = (
+            pynini.closure(delete_space + prefix_tag, 0, 1)
+            + graph_regular
+            + pynini.closure(delete_space + suffix_tag, 0, 1)
+        )
+
+        cardinal = CardinalFst()
+        cardinal_graph = cardinal.fst
+
+        # Adding cardinal graph to prevent processing out of range numbers
+        final_graph = pynini.union(time_graph, cardinal_graph)
+
+        self.fst = self.add_tokens(final_graph).optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 3406e60f1..3f5943b15 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -20,9 +20,13 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst @@ -69,6 +73,19
@@ def __init__( fraction = FractionFst(cardinal, decimal) fraction_graph = fraction.fst + + time = TimeFst() + time_graph = time.fst + + date = DateFst(cardinal) + date_graph = date.fst + + money = MoneyFst(cardinal) + money_graph = money.fst + + telephone = TelephoneFst() + telephone_graph = telephone.fst + + word_graph = WordFst().fst classify = ( @@ -76,6 +93,10 @@ def __init__( | pynutil.add_weight(ordinal_graph, 1.1) | pynutil.add_weight(decimal_graph, 1.1) | pynutil.add_weight(fraction_graph, 1.0) + | pynutil.add_weight(time_graph, 1.0) + | pynutil.add_weight(date_graph, 1.1) + | pynutil.add_weight(money_graph, 1.1) + | pynutil.add_weight(telephone_graph, 1.1) | pynutil.add_weight(word_graph, 100) )
diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py
new file mode 100644
index 000000000..88ed973df
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/date.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst
+
+
+class DateFst(GraphFst):
+    """
+    Finite state transducer for verbalizing date; each field value is followed by its unit marker,
+        e.g. date { year: "2012" month: "1" day: "5" } -> 2012년1월5일
+        e.g. date { month: "5" } -> 5월
+        e.g. date { day: "7" } -> 7일
+    """
+
+    def __init__(self):
+        super().__init__(name="date", kind="verbalize")
+
+        year_component = (
+            pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("년") + pynutil.delete("\"")
+        )
+        month_component = (
+            pynutil.delete("month: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("월") + pynutil.delete("\"")
+        )
+        day_component = (
+            pynutil.delete("day: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("일") + pynutil.delete("\"")
+        )
+
+        graph = (
+            pynini.closure(pynutil.delete(NEMO_SPACE) + year_component, 0, 1)
+            + pynini.closure(pynutil.delete(NEMO_SPACE) + month_component, 0, 1)
+            + pynini.closure(pynutil.delete(NEMO_SPACE) + day_component, 0, 1)
+        )
+
+        final_graph = self.delete_tokens(graph)
+        self.fst = final_graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py
new file mode 100644
index 000000000..45e4c7e2c
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/money.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import (
+    NEMO_CHAR,
+    NEMO_SPACE,
+    GraphFst,
+    delete_space,
+)
+
+
+class MoneyFst(GraphFst):
+    """
+    Finite state transducer for verbalizing money
+        e.g. money { currency: "₩" integer_part: "53000" } -> ₩53000
+
+    The currency value is emitted first, then the integer part;
+    at most one space between the two fields is deleted.
+    """
+
+    def __init__(self):
+        super().__init__(name="money", kind="verbalize")
+        integer = (
+            pynutil.delete("integer_part:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(NEMO_CHAR - NEMO_SPACE, 1)
+            + pynutil.delete('"')
+        )
+
+        unit = (
+            pynutil.delete("currency:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(NEMO_CHAR - NEMO_SPACE, 1)
+            + pynutil.delete('"')
+        )
+
+        optional_space = pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1).optimize()
+
+        graph = unit + optional_space + integer
+        delete_tokens = self.delete_tokens(graph)
+        self.fst = delete_tokens.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/telephone.py
new file mode 100644
index 000000000..3ac213b59
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/telephone.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst
+
+
+class TelephoneFst(GraphFst):
+    """
+    Finite state transducer for verbalizing a telephone number: the number_part value is emitted unchanged.
+        e.g. telephone { number_part: "010-1234-5678" } -> 010-1234-5678
+
+    """
+
+    def __init__(self):
+        super().__init__(name="telephone", kind="verbalize")
+
+        number_part = pynutil.delete('number_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"')
+        delete_tokens = self.delete_tokens(number_part)
+        self.fst = delete_tokens.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py
new file mode 100644
index 000000000..4b63ade99
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/time.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import (
+    NEMO_DIGIT,
+    NEMO_NOT_QUOTE,
+    NEMO_SPACE,
+    GraphFst,
+    delete_space,
+)
+
+
+class TimeFst(GraphFst):
+    """
+    Finite state transducer for verbalizing time
+        e.g. time { hours: "12" minutes: "30" } -> 12:30
+        e.g. time { minutes: "12" suffix: "전" } -> 12 전
+        e.g. time { prefix: "새벽" hours: "2" } -> 새벽2:00
+        e.g. time { hours: "2" minutes: "30" } -> 2:30
+        e.g. time { prefix: "오후" hours: "2" minutes: "30" } -> 오후2:30
+    """
+
+    def __init__(self):
+        super().__init__(name="time", kind="verbalize")
+
+        hours_component = pynutil.delete("hours: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        minutes_component = pynutil.delete("minutes: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        seconds_component = pynutil.delete("seconds: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        suffix_component = pynutil.delete("suffix: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        prefix_component = pynutil.delete("prefix: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+
+        # Add a leading zero to single-digit minutes/seconds
+        single_digit = NEMO_DIGIT
+        leading_zero = pynutil.insert("0") + single_digit
+        add_leading_zero = pynini.union(single_digit @ leading_zero, pynini.closure(NEMO_DIGIT, 2))
+
+        minutes = minutes_component @ add_leading_zero
+        seconds = seconds_component @ add_leading_zero
+
+        # Defining all the possible combinations
+        path_h = hours_component + pynutil.insert(":00")
+        path_m = minutes
+        path_s = seconds
+
+        path_hm = hours_component + delete_space + pynutil.insert(":") + minutes
+        path_hs = (
+            hours_component
+            + delete_space
+            + pynutil.insert(":")
+            + pynutil.insert("00")
+            + delete_space
+            + pynutil.insert(":")
+            + seconds
+        )
+        path_ms = minutes + delete_space + pynutil.insert(":") + seconds
+
+        path_hms = (
+            hours_component
+            + delete_space
+            + pynutil.insert(":")
+            + minutes
+            + delete_space
+            + pynutil.insert(":")
+            + seconds
+        )
+
+        time_graph = pynini.union(path_h, path_m, path_s, path_hm, path_hs, path_ms, path_hms)
+
+        # Optional prefix is emitted as-is; an optional suffix is preceded by one inserted space
+        optional_prefix_out = pynini.closure(delete_space + prefix_component, 0, 1)
+        optional_suffix_out = pynini.closure(delete_space + pynutil.insert(NEMO_SPACE) + suffix_component, 0, 1)
+
+        final_graph = optional_prefix_out + time_graph + optional_suffix_out
+        self.fst = self.delete_tokens(delete_space + final_graph).optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 8561b8631..59d1c9809 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -12,12 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import pynini +from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.decimal import DecimalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst @@ -42,5 +48,30 @@ def __init__(self): fraction = FractionFst() fraction_graph = fraction.fst - graph = cardinal_graph | ordinal_graph | decimal_graph | fraction_graph + time = TimeFst() + time_graph = time.fst + + date = DateFst() + date_graph = date.fst + + money = MoneyFst() + money_graph = money.fst + + telephone = TelephoneFst() + telephone_graph =
telephone.fst + + word = WordFst() + word_graph = word.fst + + graph = pynini.union( + cardinal_graph, + ordinal_graph, + decimal_graph, + fraction_graph, + time_graph, + date_graph, + money_graph, + telephone_graph, + word_graph, + ) self.fst = graph diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..ecad6dc19 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt @@ -0,0 +1,16 @@ +이천이십사년팔월이십팔일~2024년8월28일 +이천이십삼년 구월 오일~2023년 9월 5일 +천구백구십구년십이월삼십일일~1999년12월31일 +이천년 이월 이십구일~2000년 2월 29일 +이천십년시월십일~2010년10월10일 +이천이십일년유월십육일~2021년6월16일 +이천삼십년삼월십사일~2030년3월14일 +천구백팔십팔년 오월 이십일~1988년 5월 20일 +이천일년 칠월 구일~2001년 7월 9일 +이천십팔년사월삼십일~2018년4월30일 +삼천년팔월십오일~3000년8월15일 +이천구년 일월이십일~2009년 1월20일 +이천삼십오년~2035년 +오월~5월 +이십사일~24일 +구천구백구십구년삼월일일~9999년3월1일 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..09c6b2841 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_money.txt @@ -0,0 +1,29 @@ +오천 원~₩5000 +만오천원~₩15000 +십이만삼천 원~₩123000 +백 원~₩100 +천백십일 원~₩1111 +육십만 원~₩600000 +백만 원~₩1000000 +삼백오십만 원~₩3500000 +천이백만 원~₩12000000 +일억 원~₩100000000 +십이억오천만 원~₩1250000000 +백억 원~₩10000000000 +오천억원~₩500000000000 +일조 원~₩1000000000000 +삼조오천억 원~₩3500000000000 +영원~₩0 +구십구 원~₩99 +만 원~₩10000 +일만 원~₩10000 +십오 달러~$15 +이십불~$20 +천오백 불~$1500 +백만 달러~$1000000 +오십 유로~€50 +천 엔~¥1000 +만 엔~¥10000 +백 파운드~£100 +이십 위안~¥20 +구천구백구십구원~₩9999 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_telephone.txt new file mode 100644 index 
000000000..6702d4ddd --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_telephone.txt @@ -0,0 +1,11 @@ +공일공에 일이삼사에 오육칠팔~010-1234-5678 +영일영 구팔칠육 오사삼이~010-9876-5432 +공이에 삼사오육에 칠팔구공~02-3456-7890 +공삼일에 구팔칠에 육오사삼~031-987-6543 +천오백팔십팔에 이공공공~1588-2000 +천오백칠십칠에 구천번~1577-9000 +일일구~119 +일일이~112 +공일공 일이삼사 오육칠팔~010-1234-5678 +공이 삼사오에 육칠팔구~02-345-6789 +공일공일이삼사오육칠팔~010-1234-5678 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..450039132 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt @@ -0,0 +1,15 @@ +두시~2:00 +열두시~12:00 +두시 삼십분~2:30 +세시 삼분~3:03 +두시 반~2:30 +열두시 반~12:30 +두시 오초~2:00:05 +두시 삼십분 오초~2:30:05 +오전두시~오전2:00 +오후네시반~오후4:30 +두시전~2:00 전 +두시십분후~2:10 후 +한시 십오분 삼십초~1:15:30 +네시 이분~4:02 +열한시 오십구분~11:59 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_date.py b/tests/nemo_text_processing/ko/test_date.py new file mode 100644 index 000000000..f26513a15 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_date.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestDate:  # Korean ITN date cases; named after the class under test so pytest node ids are unambiguous
+    inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_date.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected
diff --git a/tests/nemo_text_processing/ko/test_money.py b/tests/nemo_text_processing/ko/test_money.py
new file mode 100644
index 000000000..881a1ee4e
--- /dev/null
+++ b/tests/nemo_text_processing/ko/test_money.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestMoney:  # Korean ITN money cases; named after the class under test so pytest node ids are unambiguous
+    inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_money.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected
diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh index a63c08f84..816678de3 100644 --- a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh @@ -42,6 +42,26 @@ testITNFraction() { runtest $input } +testITNTime() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_time.txt + runtest $input +} + +testITNDate() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_date.txt + runtest $input +} + +testITNMoney() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_money.txt + runtest $input +} + +testITNTelephone() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_telephone.txt + runtest $input +} + # Remove all command-line arguments shift $#
diff --git a/tests/nemo_text_processing/ko/test_telephone.py b/tests/nemo_text_processing/ko/test_telephone.py
new file mode 100644
index 000000000..4a3684896
--- /dev/null
+++ b/tests/nemo_text_processing/ko/test_telephone.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from nemo_text_processing.text_normalization.normalize import Normalizer  # NOTE(review): not referenced below
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestOrdinal:  # NOTE(review): loads test_cases_telephone.txt, not ordinal cases -- confirm the class name
+    inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_telephone.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):  # arguments are supplied by parameterized.expand above
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # the inverse-normalized result must equal the expected value
diff --git a/tests/nemo_text_processing/ko/test_time.py b/tests/nemo_text_processing/ko/test_time.py
new file mode 100644
index 000000000..c5e0f71d3
--- /dev/null
+++ b/tests/nemo_text_processing/ko/test_time.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from nemo_text_processing.text_normalization.normalize import Normalizer  # NOTE(review): not referenced below
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestOrdinal:  # NOTE(review): loads test_cases_time.txt, not ordinal cases -- confirm the class name
+    inverse_normalizer = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_time.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):  # arguments are supplied by parameterized.expand above
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # the inverse-normalized result must equal the expected value