-
Notifications
You must be signed in to change notification settings - Fork 135
Korean ITN Time #317
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: ko_itn_staging_v1
Are you sure you want to change the base?
Korean ITN Time #317
Changes from all commits
fa304a2
77da79d
9f7e876
f893d89
4df2965
41ac59d
a5164dc
7842d13
858a051
b95f5fb
98f80ed
9a00ba6
63ce43d
356a68e
473f042
6d25ac9
ad7c91e
2c4574b
bc73bb7
6f61132
68a6906
1e31335
33e02d5
53dc07b
fcfc6c5
14600aa
9fc941f
688c84f
596bac3
9b4c35f
78b726b
b7852af
736ca34
9f01195
2d51a39
19dbc24
36228e4
e5252d1
46ee848
c58e4a5
8e31004
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| 달러 $ | ||
| 불 $ | ||
| 유로 € | ||
| 엔 ¥ | ||
| 파운드 £ | ||
| 위안 ¥ | ||
| 페소 $ | ||
| 루피 ₹ | ||
| 원 ₩ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| 일 1 | ||
| 이 2 | ||
| 삼 3 | ||
| 사 4 | ||
| 오 5 | ||
| 유 6 | ||
| 칠 7 | ||
| 팔 8 | ||
| 구 9 | ||
| 시 10 | ||
| 십일 11 | ||
| 십이 12 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| 한 1 | ||
| 두 2 | ||
| 세 3 | ||
| 네 4 | ||
| 다섯 5 | ||
| 여섯 6 | ||
| 일곱 7 | ||
| 여덟 8 | ||
| 아홉 9 | ||
| 열 10 | ||
| 열한 11 | ||
| 열두 12 | ||
| 열세 13 | ||
| 열네 14 | ||
| 열다섯 15 | ||
| 열여섯 16 | ||
| 열일곱 17 | ||
| 열여덟 18 | ||
| 열아홉 19 | ||
| 스무 20 | ||
| 스물한 21 | ||
| 스물두 22 | ||
| 스물세 23 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
|
|
||
| import pynini | ||
| from pynini.lib import pynutil | ||
|
|
||
| from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst | ||
| from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path | ||
|
|
||
|
|
||
class DateFst(GraphFst):
    """
    Finite state transducer for classifying date. A year, a month and a day
    may appear together (in that order) or each on its own,
        e.g. 이천십이년 일월 오일 -> date { year: "2012" month: "1" day: "5" }
        e.g. 오월 -> date { month: "5" }
        e.g. 칠일 -> date { day: "7" }

    Args:
        cardinal: CardinalFst, whose ``just_cardinals`` graph reads the year and day numbers
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="date", kind="classify")

        def optional(fst):
            # Zero or one occurrence of ``fst``
            return pynini.closure(fst, 0, 1)

        number_graph = cardinal.just_cardinals
        month_graph = pynini.string_file(get_abs_path("data/months.tsv"))

        # A single input space is accepted as-is, so it is emitted inside the quoted value.
        # NOTE(review): presumably the verbalizer relies on that space - confirm.
        optional_space = optional(pynini.accep(NEMO_SPACE))

        # Date markers are read from the input and mapped to the empty string
        drop_year_marker = pynini.cross("년", "")
        drop_month_marker = pynini.cross("월", "")
        drop_day_marker = pynini.cross("일", "")

        # The year and month markers are optional; the day marker is mandatory
        year_field = pynutil.insert('year: "') + number_graph + optional(drop_year_marker) + pynutil.insert('"')
        month_field = (
            pynutil.insert('month: "')
            + optional_space
            + month_graph
            + optional(drop_month_marker)
            + pynutil.insert('"')
        )
        day_field = (
            pynutil.insert('day: "')
            + optional_space
            + number_graph
            + drop_day_marker
            + optional_space
            + pynutil.insert('"')
        )

        # Year, month and day in sequence, every part optional; month and day
        # are preceded by an inserted field separator
        field_separator = pynutil.insert(NEMO_SPACE)
        full_date = (
            optional(year_field)
            + optional(field_separator + month_field)
            + optional(field_separator + day_field)
        )

        # Either a single stand-alone field or the (partial) sequence above
        date_graph = pynini.union(year_field, month_field, day_field, full_date)

        self.fst = self.add_tokens(date_graph).optimize()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import pynini | ||
| from pynini.lib import pynutil | ||
|
|
||
| from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst | ||
| from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path | ||
|
|
||
|
|
||
class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money: a cardinal amount followed
    by a currency word taken from data/currency.tsv,
        e.g. 오만 삼천원 -> money { integer_part: "53000" currency: "₩" }

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="money", kind="classify")

        amount_graph = cardinal.just_cardinals
        currency_graph = pynini.string_file(get_abs_path("data/currency.tsv"))

        # Accept a single space, if there is one, between the amount and the currency
        optional_space = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1)

        integer_field = pynutil.insert('integer_part: "') + amount_graph + pynutil.insert('"') + optional_space
        currency_field = pynutil.insert(' currency: "') + currency_graph + pynutil.insert('"')

        money_graph = integer_field + currency_field
        self.fst = self.add_tokens(money_graph).optimize()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,57 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import pynini | ||
| from pynini.lib import pynutil | ||
|
|
||
| from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_extra_space | ||
| from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path | ||
|
|
||
|
|
||
class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying a generic 3-4-4 telephone number.
    The three digit blocks may be written back to back, or separated by a
    space, by the particle "에", or by "에" followed by a space. Each separator
    that is present is rewritten to "-",
        e.g. 공일공에 일이삼사에 오육칠팔 -> telephone { number_part: "010-1234-5678" }
    """

    def __init__(self):
        super().__init__(name="telephone", kind="classify")
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        # "공" is accepted as an additional reading of zero
        graph_zero_alt = pynini.cross("공", "0")
        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))

        digit = graph_digit | graph_zero | graph_zero_alt

        # "에 " is listed explicitly: a separator made of " " or "에" alone cannot
        # consume the particle together with the space that follows it, which is
        # exactly the form used in the example above.
        separator = pynini.cross(pynini.union("에 ", "에", " "), "-").optimize()

        digit_block_3 = digit + digit + digit
        digit_block_4 = digit_block_3 + digit

        # A missing separator is allowed; in that case no "-" is produced
        optional_separator = pynini.closure(separator, 0, 1)

        phone_number_graph = (
            pynutil.insert('number_part: "')
            + digit_block_3
            + optional_separator
            + digit_block_4
            + optional_separator
            + digit_block_4
            + pynutil.insert('"')
        )

        final_graph = self.add_tokens(phone_number_graph)
        self.fst = final_graph.optimize()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,108 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
|
|
||
| import pynini | ||
| from pynini.lib import pynutil | ||
|
|
||
| from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space | ||
| from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst | ||
| from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path | ||
|
|
||
|
|
||
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time
        e.g. 열두시 삼십분 -> time { hours: "12" minutes: "30" }
        e.g. 십이분전 -> time { minutes: "12" suffix: "전" }
        e.g. 새벽 두시 -> time { prefix: "새벽" hours: "2" }
        e.g. 두시반 -> time { hours: "2" minutes: "30" }
        e.g. 오후 두시반 -> time { prefix: "오후" hours: "2" minutes: "30" }

    The examples show the fields schematically: a space that is accepted
    inside a component (e.g. after a prefix word) is kept inside the quoted
    value.

    Args:
        cardinal: optional CardinalFst to reuse. When omitted, a new
            CardinalFst is built, so existing ``TimeFst()`` callers keep working.
    """

    def __init__(self, cardinal: GraphFst = None):
        super().__init__(name="time", kind="classify")

        # Single digits in cardinals for minutes and seconds
        cardinal_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        cardinal_zero = pynini.cross("영", "0")

        graph_tens_prefix = pynini.union(
            pynini.cross("이", "2"), pynini.cross("삼", "3"), pynini.cross("사", "4"), pynini.cross("오", "5")
        )
        # Graphing 10-19
        graph_ten = pynini.union(pynini.cross("십", "10"), pynini.cross("십", "1") + cardinal_digit).optimize()
        # Graphing 20-59
        graph_tens = (graph_tens_prefix + pynini.cross("십", "0")) | (
            graph_tens_prefix + pynini.cross("십", "") + cardinal_digit
        )

        graph_0_to_59 = pynini.union(cardinal_zero, cardinal_digit, graph_ten, graph_tens).optimize()

        # Hours are read from data/time/time_hours.tsv
        graph_hours = pynini.string_file(get_abs_path("data/time/time_hours.tsv"))
        # Special expression for 30 minutes
        graph_half = pynini.cross("반", "30")

        # Accepting a single space if there is one
        spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1)

        hour_suffix = pynini.cross("시", "")
        minute_suffix = pynini.cross("분", "")
        second_suffix = pynini.cross("초", "")

        hour_component = pynutil.insert("hours: \"") + (graph_hours + spacing + hour_suffix) + pynutil.insert("\"")

        # NOTE(review): only 0-59 is accepted here. Per the PR discussion, an
        # out-of-range input such as "60분" is not rejected as a whole but gets
        # split awkwardly (cardinal 6 + minute 10); a dedicated block is still
        # to be added.
        minute_component = (
            pynutil.insert("minutes: \"")
            + pynini.union(graph_0_to_59 + spacing + minute_suffix, graph_half)
            + pynutil.insert("\"")
        )

        second_component = (
            pynutil.insert("seconds: \"") + (graph_0_to_59 + spacing + second_suffix) + pynutil.insert("\"")
        )

        hour = pynini.closure(hour_component, 0, 1)
        minute = pynini.closure(delete_space + minute_component, 0, 1)
        second = pynini.closure(delete_space + second_component, 0, 1)

        graph_regular = hour + minute + second

        # 오전 = AM, 오후 = PM, 새벽 = dawn, 아침 = morning.
        # The optional space is appended once to the union rather than to every word.
        prefix_words = pynini.union("오전", "오후", "새벽", "아침") + spacing
        prefix_tag = pynutil.insert("prefix: \"") + prefix_words + pynutil.insert("\"")

        # 전 = before, 후 = after
        suffix_words = pynini.union("전", "후")
        suffix_tag = pynutil.insert("suffix: \"") + suffix_words + pynutil.insert("\"")

        time_graph = (
            pynini.closure(delete_space + prefix_tag, 0, 1)
            + graph_regular
            + pynini.closure(delete_space + suffix_tag, 0, 1)
        )

        # Reuse the caller's cardinal graph when one is passed in, so the
        # cardinal grammar does not have to be compiled a second time.
        if cardinal is None:
            cardinal = CardinalFst()
        cardinal_graph = cardinal.fst

        # Adding cardinal graph to prevent processing out of range numbers
        final_graph = pynini.union(time_graph, cardinal_graph)

        self.fst = self.add_tokens(final_graph).optimize()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,9 +20,13 @@ | |
|
|
||
| from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main | ||
| from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst | ||
| from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst | ||
| from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst | ||
| from nemo_text_processing.inverse_text_normalization.ko.taggers.fraction import FractionFst | ||
| from nemo_text_processing.inverse_text_normalization.ko.taggers.money import MoneyFst | ||
| from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst | ||
| from nemo_text_processing.inverse_text_normalization.ko.taggers.telephone import TelephoneFst | ||
| from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst | ||
| from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst | ||
|
|
||
|
|
||
|
|
@@ -69,13 +73,30 @@ def __init__( | |
|
|
||
| fraction = FractionFst(cardinal, decimal) | ||
| fraction_graph = fraction.fst | ||
|
|
||
| time = TimeFst() | ||
| time_graph = time.fst | ||
|
|
||
| date = DateFst(cardinal) | ||
| date_graph = date.fst | ||
|
|
||
| money = MoneyFst(cardinal) | ||
| money_graph = money.fst | ||
|
|
||
| telephone = TelephoneFst() | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Pass cardinal to your telephone FST like the ones above. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am actually not using cardinal for telephone. I am just using the same digit data as the cardinal class. |
||
| telephone_graph = telephone.fst | ||
|
|
||
| word_graph = WordFst().fst | ||
|
|
||
| classify = ( | ||
| pynutil.add_weight(cardinal_graph, 1.1) | ||
| | pynutil.add_weight(ordinal_graph, 1.1) | ||
| | pynutil.add_weight(decimal_graph, 1.1) | ||
| | pynutil.add_weight(fraction_graph, 1.0) | ||
| | pynutil.add_weight(time_graph, 1.0) | ||
| | pynutil.add_weight(date_graph, 1.1) | ||
| | pynutil.add_weight(money_graph, 1.1) | ||
| | pynutil.add_weight(telephone_graph, 1.1) | ||
| | pynutil.add_weight(word_graph, 100) | ||
| ) | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.