ming024 · DanojaDias · Aug 28, 2022
diff --git a/config/AISHELL3/preprocess.yaml b/config/AISHELL3/preprocess.yaml
@@ -13,7 +13,7 @@ preprocessing:
     language: "zh"
   audio:
     sampling_rate: 22050
-    max_wav_value: 32768.0
+    max_wav_value: 32767.0 # 2**15 - 1; presumably the int16 full-scale factor for audio samples - confirm against the code that reads it
   stft:
     filter_length: 1024
     hop_length: 256

diff --git a/config/LJSpeech/preprocess.yaml b/config/LJSpeech/preprocess.yaml
@@ -13,7 +13,7 @@ preprocessing:
     language: "en"
   audio:
     sampling_rate: 22050
-    max_wav_value: 32768.0
+    max_wav_value: 32767.0 # 2**15 - 1; presumably the int16 full-scale factor for audio samples - confirm against the code that reads it
   stft:
     filter_length: 1024
     hop_length: 256

diff --git a/config/LJSpeech_paper/preprocess.yaml b/config/LJSpeech_paper/preprocess.yaml
@@ -13,7 +13,7 @@ preprocessing:
     language: "en"
   audio:
     sampling_rate: 22050
-    max_wav_value: 32768.0
+    max_wav_value: 32767.0 # 2**15 - 1; presumably the int16 full-scale factor for audio samples - confirm against the code that reads it
   stft:
     filter_length: 1024
     hop_length: 256

diff --git a/config/LibriTTS/preprocess.yaml b/config/LibriTTS/preprocess.yaml
@@ -13,7 +13,7 @@ preprocessing:
     language: "en"
   audio:
     sampling_rate: 22050
-    max_wav_value: 32768.0
+    max_wav_value: 32767.0 # 2**15 - 1; presumably the int16 full-scale factor for audio samples - confirm against the code that reads it
   stft:
     filter_length: 1024
     hop_length: 256

diff --git a/config/Sinhala/model.yaml b/config/Sinhala/model.yaml
@@ -0,0 +1,37 @@
+transformer: # encoder/decoder hyperparameters for the Sinhala configuration
+  encoder_layer: 4
+  encoder_head: 2
+  encoder_hidden: 256
+  decoder_layer: 6
+  decoder_head: 2
+  decoder_hidden: 256
+  conv_filter_size: 1024
+  conv_kernel_size: [9, 1] # two kernel sizes; presumably one per conv layer within a block - confirm against the model code
+  encoder_dropout: 0.2
+  decoder_dropout: 0.2
+
+variance_predictor:
+  filter_size: 256
+  kernel_size: 3
+  dropout: 0.5
+
+variance_embedding:
+  pitch_quantization: "linear" # support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
+  energy_quantization: "linear" # support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
+  n_bins: 256 # presumably the number of quantization bins shared by pitch and energy - confirm
+
+# gst: # NOTE(review): this whole section is commented out; confirm the model code reads a 'gst' key before re-enabling it
+#   use_gst: False
+#   conv_filters: [32, 32, 64, 64, 128, 128]
+#   gru_hidden: 128
+#   token_size: 128
+#   n_style_token: 10
+#   attn_head: 4
+
+multi_speaker: False # multi-speaker mode is switched off for this configuration
+
+max_seq_len: 12000 # NOTE(review): presumably an upper bound on sequence length; unit (frames vs. tokens) not visible here - confirm
+
+vocoder:
+  model: "HiFi-GAN" # support 'HiFi-GAN', 'MelGAN'
+  speaker: "universal" # support 'LJSpeech', 'universal'
diff --git a/config/Sinhala/preprocess.yaml b/config/Sinhala/preprocess.yaml
@@ -0,0 +1,30 @@
+dataset: "Sinhala"
+
+path:
+  corpus_path: "/home/danoja/MSC/Project/implementation/Data/Sinhala" # machine-specific location of the raw corpus (outside the repo); change this for your environment
+  lexicon_path: "./lexicon/sinhala-lexicon.txt" # repo-relative like raw_path, so the config works on any checkout (assumes scripts run from the repo root)
+  raw_path: "./raw_data/Sinhala"
+  preprocessed_path: "./preprocessed_data/Sinhala" # repo-relative like raw_path, so the config works on any checkout (assumes scripts run from the repo root)
+
+preprocessing:
+  val_size: 512
+  text:
+    text_cleaners: ["sinhala_cleaners"]
+    language: "si"
+  audio:
+    sampling_rate: 22050
+    max_wav_value: 32767.0 # 2**15 - 1; presumably the int16 full-scale factor for audio samples - confirm against the code that reads it
+  stft:
+    filter_length: 1024
+    hop_length: 256
+    win_length: 1024
+  mel:
+    n_mel_channels: 80
+    mel_fmin: 0
+    mel_fmax: 8000 # please set to 8000 for HiFi-GAN vocoder, set to null for MelGAN vocoder
+  pitch:
+    feature: "frame_level" # support 'phoneme_level' or 'frame_level'
+    normalization: True # while this is True, pitch_quantization in model.yaml must stay 'linear' (see the note on that key)
+  energy:
+    feature: "frame_level" # support 'phoneme_level' or 'frame_level'
+    normalization: True # while this is True, energy_quantization in model.yaml must stay 'linear' (see the note on that key)
diff --git a/config/Sinhala/train.yaml b/config/Sinhala/train.yaml
@@ -0,0 +1,20 @@
+path: # output locations, all relative to the working directory
+  ckpt_path: "./output/ckpt/Sinhala"
+  log_path: "./output/log/Sinhala"
+  result_path: "./output/result/Sinhala"
+optimizer:
+  batch_size: 2 # NOTE(review): very small batch; presumably chosen to fit limited GPU memory - confirm before reusing
+  betas: [0.9, 0.98]
+  eps: 0.000000001 # i.e. 1e-9
+  weight_decay: 0.0
+  grad_clip_thresh: 1.0
+  grad_acc_step: 1
+  warm_up_step: 4000
+  anneal_steps: [300000, 400000, 500000] # all three milestones fall before total_step (900000)
+  anneal_rate: 0.3 # presumably the learning-rate multiplier applied at each anneal step - confirm against the optimizer code
+step: # step-count settings; all values are in training steps
+  total_step: 900000
+  log_step: 100
+  synth_step: 1000
+  val_step: 1000
+  save_step: 50000