Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pipeline {
HU_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/07-16-24-0'
PT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
RU_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
VI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
VI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-16-25-0'
SV_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
ZH_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/11-13-24-0'
IT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/08-22-24-0'
Expand Down Expand Up @@ -171,7 +171,7 @@ pipeline {
}
}

stage('L0: Create FR TN/ITN & VI ITN & HU TN & IT TN') {
stage('L0: Create FR TN/ITN & VI TN/ITN & HU TN & IT TN') {
when {
anyOf {
branch 'main'
Expand All @@ -197,6 +197,11 @@ pipeline {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=vi --text="một ngàn " --cache_dir ${VI_TN_CACHE}'
}
}
stage('L0: VI TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=vi --text="100" --cache_dir ${VI_TN_CACHE}'
}
}
stage('L0: HU TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hu --text="100" --cache_dir ${HU_TN_CACHE}'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ $ đô la mỹ
₩ won
₩ uôn
RM ringgit
₫ đồng
£ bảng anh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
- gạch
_ gạch dưới
_ shift gạch
_ shift trừ
_ síp gạch
! chấm than
# thăng
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
. chấm
- gạch
- gạch ngang
_ gạch dưới
_ shift gạch
_ shift trừ
_ síp gạch
/ sẹc
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, '"').optimize()

NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,117 +36,118 @@ class CardinalFst(GraphFst):

def __init__(self):
super().__init__(name="cardinal", kind="classify")
graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

thousand_words = pynini.union("ngàn", "nghìn")
negative_words = pynini.union("âm", "trừ")

graph_hundred = pynini.cross("trăm", "")
graph_ten = pynini.cross("mươi", "")
zero = pynini.cross(pynini.union("linh", "lẻ"), "0")

graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
graph_one = pynini.cross("mốt", "1")
graph_four = pynini.cross("tư", "4")
graph_five = pynini.cross("lăm", "5")
graph_half = pynini.cross("rưỡi", "5")
graph_hundred = pynini.cross("trăm", "")
graph_ten = pynini.cross("mươi", "")
zero = pynini.cross(pynini.union("linh", "lẻ"), "0")

optional_ten = pynini.closure(delete_space + graph_ten, 0, 1)
last_digit_exception = pynini.project(pynini.cross("năm", "5"), "input")
last_digit = pynini.union(
self.last_digit = pynini.union(
(pynini.project(graph_digit, "input") - last_digit_exception.arcsort()) @ graph_digit,
graph_one,
graph_four,
graph_five,
)

graph_hundred_ties_component = (graph_digit | graph_zero) + delete_space + graph_hundred
graph_hundred_ties_component += delete_space
graph_hundred_ties_component += pynini.union(
last_digit = self.last_digit
# Build hundreds component (e.g., "một trăm", "hai trăm")
graph_hundreds_component = (graph_digit | graph_zero) + delete_space + graph_hundred
graph_hundreds_component += delete_space
graph_hundreds_component += pynini.union(
graph_teen,
(graph_half | graph_four | graph_one) + pynutil.insert("0"),
graph_ties + optional_ten + ((delete_space + last_digit) | pynutil.insert("0")),
zero + delete_space + (graph_digit | graph_four),
pynutil.insert("00"),
)
graph_hundred_ties_component |= (
(graph_half | graph_four | graph_one) + pynutil.insert("0", weight=0.1),
graph_ties + optional_ten + ((delete_space + last_digit) | pynutil.insert("0", weight=0.1)),
zero + delete_space + (graph_digit | graph_four | graph_five),
pynutil.insert("00", weight=0.1),
).optimize()
graph_hundreds_component |= (
pynutil.insert("0")
+ delete_space
+ pynini.union(
graph_teen,
graph_ties + optional_ten + delete_space + last_digit,
graph_ties + delete_space + graph_ten + pynutil.insert("0"),
zero + delete_space + (graph_digit | graph_four),
)
graph_ties + delete_space + graph_ten + pynutil.insert("0", weight=0.1),
zero + delete_space + (graph_digit | graph_four | graph_five),
).optimize()
)
graph_hundred_component = graph_hundreds_component | (
pynutil.insert("00", weight=0.1) + delete_space + graph_digit
)
graph_hundred_component = graph_hundred_ties_component | (pynutil.insert("00") + delete_space + graph_digit)

graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
)
self.graph_hundred_component_at_least_one_none_zero_digit = (
graph_hundred_component_at_least_one_none_zero_digit
graph_hundred_component_at_least_one_none_zero_digit.optimize()
)
graph_hundred_ties_zero = graph_hundred_ties_component | pynutil.insert("000")
graph_hundreds_zero = graph_hundreds_component | pynutil.insert("000", weight=0.1)

graph_thousands = pynini.union(
graph_hundred_component_at_least_one_none_zero_digit
+ delete_space
+ pynutil.delete(pynini.union("nghìn", "ngàn")),
graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete(thousand_words),
pynutil.insert("000", weight=0.1),
)

graph_ten_thousand = pynini.union(
graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("vạn"),
pynutil.insert("0000", weight=0.1),
)

graph_ten_thousand_suffix = pynini.union(
graph_digit + delete_space + pynutil.delete(pynini.union("nghìn", "ngàn")),
pynutil.insert("0", weight=0.1),
)
).optimize()

graph_million = pynini.union(
graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("triệu"),
pynutil.insert("000", weight=0.1),
)
).optimize()
graph_billion = pynini.union(
graph_hundred_component_at_least_one_none_zero_digit
+ delete_space
+ pynutil.delete(pynini.union("tỉ", "tỷ")),
pynutil.insert("000", weight=0.1),
)
).optimize()

# Main graph combining all magnitude levels
graph = pynini.union(
# Full format: billion + million + thousand + hundred
graph_billion
+ delete_space
+ graph_million
+ delete_space
+ graph_thousands
+ delete_space
+ graph_hundred_ties_zero,
graph_ten_thousand + delete_space + graph_ten_thousand_suffix + delete_space + graph_hundred_ties_zero,
+ graph_hundreds_zero,
# Special thousand format with last digit or "rưỡi" (half)
graph_hundred_component_at_least_one_none_zero_digit
+ delete_space
+ pynutil.delete(pynini.union("nghìn", "ngàn"))
+ pynutil.delete(thousand_words)
+ delete_space
+ (((last_digit | graph_half) + pynutil.insert("00")) | graph_hundred_ties_zero),
+ (((last_digit | graph_half) + pynutil.insert("00", weight=0.1)) | graph_hundreds_zero),
# Single digits (for non-exception cases)
graph_digit,
graph_zero,
)

graph = graph @ pynini.union(
pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
"0",
graph = (
graph
@ pynini.union(
pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
"0",
).optimize()
)

# don't convert cardinals from zero to nine inclusive
graph_exception = pynini.project(pynini.union(graph_digit, graph_zero), "input")
single_digits = pynini.project(pynini.union(graph_digit, graph_zero), "input").optimize()

self.graph_no_exception = graph

self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph
self.graph = pynini.difference(pynini.project(graph, "input"), single_digits) @ graph

optional_minus_graph = pynini.closure(
pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"-"') + NEMO_SPACE,
pynutil.insert("negative: ") + pynini.cross(negative_words, '"-"') + NEMO_SPACE,
0,
1,
)
Expand Down
Loading
Loading