From 461f668c1dc8f6d660333325ffc7827f1533cdc9 Mon Sep 17 00:00:00 2001 From: Haoran Wang <2472356642@qq.com> Date: Tue, 17 Jun 2025 19:35:09 -0500 Subject: [PATCH 1/6] add faster-whisper wer --- versa/__init__.py | 4 ++++ versa/scorer_shared.py | 30 +++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/versa/__init__.py b/versa/__init__.py index 3210249..8471d37 100644 --- a/versa/__init__.py +++ b/versa/__init__.py @@ -55,6 +55,10 @@ whisper_levenshtein_metric, whisper_wer_setup, ) +from versa.corpus_metrics.faster_whisper_wer import ( + faster_whisper_levenshtein_metric, + faster_whisper_wer_setup, +) from versa.utterance_metrics.asr_matching import asr_match_metric, asr_match_setup from versa.utterance_metrics.audiobox_aesthetics_score import ( audiobox_aesthetics_score, diff --git a/versa/scorer_shared.py b/versa/scorer_shared.py index fd74a31..f9d35e7 100644 --- a/versa/scorer_shared.py +++ b/versa/scorer_shared.py @@ -361,7 +361,35 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal "args": args_cache, } logging.info("Initiate Whisper WER calculation successfully") + elif config["name"] == "faster_whisper_wer": + if not use_gt_text: + logging.warning("Cannot use faster_whisper_wer because no gt text is provided") + continue + + logging.info("Loading faster_whisper_wer metric with reference text") + from versa import faster_whisper_levenshtein_metric, faster_whisper_wer_setup + + # Load whisper model if it is already loaded + if ( + "speaking_rate" in score_modules.keys() + or "asr_matching" in score_modules.keys() + ): + args_cache = score_modules["speaking_rate"]["args"] + else: + args_cache = faster_whisper_wer_setup( + model_tag=config.get("model_tag", "default"), + beam_size=config.get("beam_size", 1), + batch_size=config.get("batch_size", 1), + compute_type=config.get("compute_type", "float32"), + text_cleaner=config.get("text_cleaner", "whisper_basic"), + use_gpu=use_gpu, + ) + score_modules["faster_whisper_wer"] = { + "module": faster_whisper_levenshtein_metric, + "args": args_cache, + } + logging.info("Initiate faster_whisper WER calculation successfully") elif config["name"] == "scoreq_ref": if not use_gt: logging.warning("Cannot use scoreq_ref because no gt audio is provided") @@ -1001,7 +1029,7 @@ def use_score_modules(score_modules, gen_wav, gt_wav, gen_sr, text=None): score = score_modules[key]["module"]( score_modules[key]["model"], gen_wav, gt_wav, gen_sr ) - elif key == "espnet_wer" or key == "owsm_wer" or key == "whisper_wer": + elif key == "espnet_wer" or key == "owsm_wer" or key == "whisper_wer" or key == "faster_whisper_wer": score = score_modules[key]["module"]( score_modules[key]["args"], gen_wav, From db72a2706eefc9a5fa9408bb05b533efa2e79f16 Mon Sep 17 00:00:00 2001 From: Haoran Wang <2472356642@qq.com> Date: Mon, 23 Jun 2025 13:13:27 -0500 Subject: [PATCH 2/6] add: three metrics --- .gitignore | 2 + egs/separate_metrics/wer.yaml | 56 +++++++- setup.py | 2 + test/test_pipeline/test_general.py | 5 +- test/test_pipeline/test_wer.py | 59 +++----- test/test_samples/test_wer/test_wer.wav | Bin 0 -> 18604 bytes test/test_samples/text_wer | 1 + tools/install_faster-whisper.sh | 23 ++++ versa/__init__.py | 8 ++ versa/corpus_metrics/faster_whisper_wer.py | 149 +++++++++++++++++++++ versa/corpus_metrics/hubert_wer.py | 141 +++++++++++++++++++ versa/corpus_metrics/nemo_wer.py | 137 +++++++++++++++++++ versa/metrics.py | 45 ++++++- versa/scorer_shared.py | 54 +++++++- 14 files changed, 634 
insertions(+), 48 deletions(-) create mode 100644 test/test_samples/test_wer/test_wer.wav create mode 100644 test/test_samples/text_wer create mode 100644 tools/install_faster-whisper.sh create mode 100644 versa/corpus_metrics/faster_whisper_wer.py create mode 100644 versa/corpus_metrics/hubert_wer.py create mode 100644 versa/corpus_metrics/nemo_wer.py diff --git a/.gitignore b/.gitignore index 108609f..cc6b13e 100644 --- a/.gitignore +++ b/.gitignore @@ -169,3 +169,5 @@ fadtk/ scoreq/ fairseq/ UTMOSv2/ +versa_cache/ +hub/ \ No newline at end of file diff --git a/egs/separate_metrics/wer.yaml b/egs/separate_metrics/wer.yaml index 1d4847e..d636450 100644 --- a/egs/separate_metrics/wer.yaml +++ b/egs/separate_metrics/wer.yaml @@ -41,7 +41,7 @@ # More model_tag can be from https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages . # The default model is `large-v3`. # NOTE(jiatong): further aggregation are necessary for corpus-level WER/CER -# --whisper_hyp_text: the hypothesis from ESPnet ASR decoding +# --whisper_hyp_text: the hypothesis from Whisper ASR decoding # --ref_text: reference text (after cleaner) # --whisper_wer_delete: delete errors # --whisper_wer_insert: insertion errors @@ -54,4 +54,58 @@ - name: whisper_wer model_tag: default beam_size: 5 + text_cleaner: whisper_basic + + +# Word error rate with faster-whisper model +# More model_tag can be from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/utils.py . +# The default model is `large-v3`. +# --faster_whisper_hyp_text: the hypothesis from faster-whisper ASR decoding +# --ref_text: reference text (after cleaner) +# --faster_whisper_wer_delete: delete errors +# --faster_whisper_wer_insert: insertion errors +# --faster_whisper_wer_replace: replacement errors +# --faster_whisper_wer_equal: correct matching words/character counts +# --faster_whisper_cer_delete: delete errors +# --faster_whisper_cer_insert: insertion errors +# --faster_whisper_cer_replace: replacement errors +# --faster_whisper_cer_equal: correct matching words/character counts +- name: faster_whisper_wer + model_tag: default + beam_size: 5 + batch_size: 1 + compute_type: float32 + text_cleaner: whisper_basic + +# Word error rate with NeMo asr model +# The default model is `nvidia/stt_en_conformer_transducer_xlarge`. +# --nemo_hyp_text: the hypothesis from NeMo ASR decoding +# --ref_text: reference text (after cleaner) +# --nemo_wer_delete: delete errors +# --nemo_wer_insert: insertion errors +# --nemo_wer_replace: replacement errors +# --nemo_wer_equal: correct matching words/character counts +# --nemo_cer_delete: delete errors +# --nemo_cer_insert: insertion errors +# --nemo_cer_replace: replacement errors +# --nemo_cer_equal: correct matching words/character counts +- name: nemo_wer + model_tag: default + text_cleaner: whisper_basic + + +# Word error rate with Hubert-Large-Finetuned model +# The default model is `facebook/hubert-large-ls960-ft`. 
+# --hubert_hyp_text: the hypothesis from Hubert ASR decoding +# --ref_text: reference text (after cleaner) +# --hubert_wer_delete: delete errors +# --hubert_wer_insert: insertion errors +# --hubert_wer_replace: replacement errors +# --hubert_wer_equal: correct matching words/character counts +# --hubert_cer_delete: delete errors +# --hubert_cer_insert: insertion errors +# --hubert_cer_replace: replacement errors +# --hubert_cer_equal: correct matching words/character counts +- name: hubert_wer + model_tag: default text_cleaner: whisper_basic \ No newline at end of file diff --git a/setup.py b/setup.py index 92a2c68..73a719c 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ "importlib-metadata", "kaggle", "kaldiio", + "jamo", "lazy_loader", "Levenshtein", "librosa", @@ -58,6 +59,7 @@ "espnet_model_zoo", "discrete-speech-metrics @ git+https://github.com/ftshijt/DiscreteSpeechMetrics.git@v1.0.2", "cdpam", + "nemo_toolkit[asr]" ], extras_require={ "dev": [ diff --git a/test/test_pipeline/test_general.py b/test/test_pipeline/test_general.py index d8929ea..09a165b 100755 --- a/test/test_pipeline/test_general.py +++ b/test/test_pipeline/test_general.py @@ -40,6 +40,9 @@ "torch_squim_stoi": 0.6027805209159851, "torch_squim_pesq": 1.1683127880096436, "torch_squim_si_sdr": -11.109052658081055, + "dnsmos_pro_bvcc": 1.1717286109924316, + "dnsmos_pro_nisqa": 1.4733699560165405, + "dnsmos_pro_vcc2018": 1.930935263633728, } @@ -77,7 +80,7 @@ def info_update(): # for sir" continue # the plc mos is undeterministic - if abs(TEST_INFO[key] - summary[key]) > 1e-4 and key != "plcmos": + if abs(TEST_INFO[key] - summary[key]) > 2e-4 and key != "plcmos": raise ValueError( "Value issue in the test case, might be some issue in scorer {}".format( key diff --git a/test/test_pipeline/test_wer.py b/test/test_pipeline/test_wer.py index cb7c43c..2d14821 100755 --- a/test/test_pipeline/test_wer.py +++ b/test/test_pipeline/test_wer.py @@ -12,68 +12,49 @@ ) TEST_INFO = { - "mcd": 5.045226506332897, - "f0rmse": 20.28100448994277, - "f0corr": -0.07540903652440145, - "sdr": 4.8739529795936445, - "sir": float("inf"), - "sar": 4.8739529795936445, - "si_snr": 1.0702757835388184, - "ci_sdr": 4.873954772949219, - "pesq": 1.5722705125808716, - "stoi": 0.0076251088596473275, - "speech_bert": 0.9727544188499451, - "speech_bleu": 0.6699938983346256, - "speech_token_distance": 0.850506056080969, - "utmos": 1.9074358940124512, - "dns_overall": 1.4526059573614438, - "dns_p808": 2.094302177429199, - "plcmos": 3.1603124300638834, - "spk_similarity": 0.8953609466552734, - "singmos": 2.0403053760528564, - "sheet_ssqa": 1.5056110620498657, - "se_sdr": -10.220606003834313, - "se_sar": -10.220606003834313, - "se_si_snr": -16.837072372436523, - "se_ci_sdr": -10.220602989196777, + "espnet_wer_equal": 1, + "owsm_wer_equal": 1, + "whisper_wer_equal": 1, + "faster_whisper_wer_equal": 1, + "nemo_wer_equal": 1, + "hubert_wer_equal": 1, } def info_update(): # find files - if os.path.isdir("test/test_samples/test2"): - gen_files = find_files("test/test_samples/test2") - - # find reference file - if os.path.isdir("test/test_samples/test1"): - gt_files = find_files("test/test_samples/test1") + if os.path.isdir("test/test_samples/test_wer"): + gen_files = find_files("test/test_samples/test_wer") logging.info("The number of utterances = %d" % len(gen_files)) - with open("egs/speech.yaml", "r", encoding="utf-8") as f: + with open("egs/separate_metrics/wer.yaml", "r", encoding="utf-8") as f: score_config = yaml.full_load(f) score_modules = 
load_score_modules( score_config, - use_gt=(True if gt_files is not None else False), + use_gt=False, + use_gt_text=True, use_gpu=False, ) assert len(score_config) > 0, "no scoring function is provided" + text_info = {} + with open("test/test_samples/text_wer") as f: + for line in f.readlines(): + key, value = line.strip().split(maxsplit=1) + text_info[key] = value + score_info = list_scoring( - gen_files, score_modules, gt_files, output_file=None, io="soundfile" + gen_files, score_modules, text_info=text_info, output_file=None, io="soundfile" ) summary = load_summary(score_info) print("Summary: {}".format(load_summary(score_info)), flush=True) - for key in summary: - if math.isinf(TEST_INFO[key]) and math.isinf(summary[key]): - # for sir" - continue - # the plc mos is undeterministic - if abs(TEST_INFO[key] - summary[key]) > 1e-4 and key != "plcmos": + for key in TEST_INFO: + if abs(TEST_INFO[key] - summary[key]) > 0 and not key == "espnet_wer_equal": raise ValueError( "Value issue in the test case, might be some issue in scorer {}".format( key diff --git a/test/test_samples/test_wer/test_wer.wav b/test/test_samples/test_wer/test_wer.wav new file mode 100644 index 0000000000000000000000000000000000000000..f3ec5880d1f9107fe2f53e674d65aa25cb449c68 GIT binary patch literal 18604 zcmW+;1$Y!m7wqnt-Hp4ukdOom?(XhxhePn-?(T57ySuwP1PLL;h`W1scc%NV`@j3h zMP_GuyIo7Ipb(F+@xu zb9g#$OVY$cK16IGE5sF1SCo;%;x~Cfx)2}IjdY`yWDFTigXn4zL*~g$RBH!>rd>8L{Cnb>&W#z^N%0+4?`tggruCkJTB`b|#q8Gi- zUHN>>+>3o24L(jwpLub*#xJ+rShA2yT3uU7kA&U4M z247$29PP#k=|Al#S->`ujJoIEw-Pe(gkD^Eu-Pw zhV5boc2!s~NvcczMSb>_&LyiTAvI+tJg^@M*k&?O{33Pe6*_>_AvZ`j)|y1o&f>ag zLbY_D2&HF97+oX=i4kO*cpzj}%tORLNrToMGC3mhRhfJzUFbD#PF~PMWPo@}YO~$k zA3JL#jRP9IqV1&Vq$fQjJ*Dm09kHHvWC3&T+Xm)f!k>qRvuJl2ma6@qdk-AYtqVYa*L#Tr@qUV{#|0mDXH7D`#t& zW{fsa+f6f3F49bsJXs_5f!dQ%>Qx@7Du$hg{)XRG!N!8BIu(IcDHX3PT31z6oGUk0 z^{Enx8XkW`4>vO4m=j zTPE^v;wHwZztjYMwW?E<=9Rt5pOy_ReNyOOI;149=wSgZ*;md>*OZ*7IH(_}pKl1_ zPS{Cpq-RpLZma20vr!hytf-aFX0%O&)n=P=YuW0KMGy0VItN|8{FffZZ00G?^&_eh zDo<7nC@C&|QU0`OdqH+dVA;%~ih{(Vkg^&jyr@n2&GH8oL-czM!Nz#CIsHjp*>~B+ z#L1+WiKTgUi!95%ma8oXS;bqLT02-BFdJ$1&iuA!rQAXuPQR-g)jG?#c6_(~#E66T7RCuspU2)Hnl+x;zBg^I*`|%aV=RBV+kpIe2y2GaC=Jw{H zma{F+TT0eVZ0A`owW@A2*IKdcZ2rysmx-O`r8Gght5#S37@z6aR1Pe!Ri0C{ys%$U zR$*q|KLuk88W(ITDk{8Mc%aPN(5j+MRhU{ShLAIKgl3O6)x^o{U#q270ahn%&20PH z<=UG$G_yNzS7i6E%^WKaOPzUt6MNl8sXpDy7aI%pPb=4#2bJ$H*f$k@j3HsWVj-^S3 zuK6Z8i?dmFw+z$FtgO}PP8nZvit>9DWEE~JYiG1j&xug&71K}Kcj(_^|%p?4iu$*tV; zvx{@!oPcvC+tdf;ABtw=%t*hHJ~Dk&!sFN;5w?-vBJ;mCj2aO+FU~cib?Nsid(#Q- z!-AY@O|JK%VVfo?P43pyHV6)V88jhaz28r#6pOtUk^0<%8KrH~yT-Ys_@)LW*2pQz zYL(SKKR)|-%A2%0sY!A7Q*UJV&9^KZQDLcT=u*esBA|BN(Y0IDj|goYm{Dzj{|&bT zE{&Z|n}=$B<^ENdbC+d}PCk0GoCe!Dp{D@H1kD@HpL<5 ze*T8MYsRZ~Rd&-Hd;3Osd-&D$-RIHYy})aM>s;qU9z`~;=FhGA@Z`b~g%2|RreDpO zlYP8!Vddt^=E{C!Q0dJ4Nx5rNR>q!)+ZkQ-=Tu5e%8jBmrgyEoyBrAete#apy2f$e z9=_>*(>y}mT6r9>&NFXme!6Tzc51qL{HM6+lr70qaw{sG4A<3js%6#PqBi*((k{d| zjSc+sI{sqX+06I)&el5|CVBJ^X&(F}Kn{rYQoJ^~XSxOg0}a|5H#W^H_>V z@J#5Jbw7J_rMIwWb-~*T^hfiHv#TY4h#e3;E53KeoQzrJNtQZ?ajvziFR2ky?Q-BA z?*o2kJ!Oxt?v@_C9a~xTH(gWFFmGbg$MlX#Q&LYB9;|x9E3_LmseG1rSa3M!dd9JY z4)OKV7U#ar86+p$ZgB7CA5-gUt)QB-gM)p&L!NrXc<%JzUXgb9?A~kD5^YwKq-L2e ziD$AGmtEjpH0BoLG{b4CetXWLY>9pLvNr3y$Z#GlbaYxjqR@t5a6@M(yz-R|EbA z9jxZ$E_u9%}>f7r4tsPkNL(m)lJ^=%~rn{MX40c=U9_)Nd_9&j5txs`IXqU({cwslC zxw%$1j`yY=3qPg&N~|6=;@if*TQWNo9nhY(pX--dvm$h1t%RVBK7aiNdBr=#xioTE z;_%oh+p1}~XZEgi-?(WB8&X3vHDznG?@i`O20o@JKDAT)#2-^Wrbk#Lcw`w3Tg(&P zrJrlK;~MK&t3`Gpwv#M687p!>CJ&0wh@F?XCbduTH+6~Ty!Nzl 
zaY14Fl9-bb_dY-UWQ-i15w3PI?dsxM?P^V%;8?#G?mJu}9Ch~N?OWS4u{q`xpjn*L zJ4+Xr^nF2mQv9W)g(8kT)?`{Os~VK~GK>BW|M2|#%!q06L6x4ma+}_Me}Y)>5&tW$ zk6or&r&{>hx3HUTUgUhn!lkT#>B_&p5jA21;-|*h8dr)g@@l&`Qdp^bu4nAyPuC(+ zzBP^cr+AR|lEW&m*xGmNr2B334E1`X;o(Y zPt#xhe!q;_ROV|k!{b(s3w8H3I8$?x%V~$-Zs#nE?MHg;_TFjN-!xJQ&kK)j8*%7c z&FJ*>TY28JmDMYgW71CAvWUh1{p0^9>+kZSJl!jYvwo$a{X>5R-*q43xXO91^JIs? zE@M0@t=DON)hl_wW844G{$+Xu?J%x#`R9hE}o~;*qT?n zT;p?fOTT+A^POEhdOFT@nCsr%+1Yxx{JW@6MyvQ95#hf^C%L6HFPu)9ZY1fdwIpto!{o zYkclc<9d_rx@lywvY=>C(tw!gzx9*#g~u%RTb}UvRdZiJbMGjRGqwwyi(U8IeRO}| zINNfePL!6XwTktM@QsZ5%QB)X{)#G-##T?n`>KKYyW*6Dn(4cfvz7j~*BxpFCx
  • G7WwGu4T&ZJxrTWvH>;UVj4M*3Xvo$DOwFwZO5sg*~x?;^K9 zj{Zzos1NG|wg_Qm7VtT;7m zs_}{SO$RThsb))z=6SUedPOaajsN?zc$9_SVYc7IkXF@PJSv>-TBvr;t{yIvs-5yP zaa(8^QL!k=IV$2q_}AutkLS#l%5C~Pv~e0{^0Mr1*3aKo-y`DJ$A2igWm)EYI^cAu z>?=H*xuls^+AnfWaINF>+^>hLhsEa->y&Okl0M$~);@Mkq0IiVdFmYJbVJ*&LQM;f zTKILtkD|E7xs6QTx*YQV7_{AQjmLM#1LjYyG%m~B`ujWjuk@&4*{y0*a@(JdpC*1i z7<0XF7yHL{rOOGYE0#r7bJ8#Tsr6;>r$&FTr)C@Ldc5^}5$qkh-P6}C%dxkv+-ZPo z7gsspncsV>CE{0R$RFSD77>r5lQU;YjjT&NmU^DEwpAY&dB@%U=KFEhm%k~)i$cs+ zcyA0k9$Z$_(|eX@jCHB`HV@%796x`2 z{^@@3^Qdpv|7<9uE-k%MLrymuS7&&wp#goJ>w7(Rk!?2HdD;Au8kUyFpZ`_yY1HR? zvE6etI$sZC;F8dX)h}3`S7TCFea(Kk>QVmv@K>52R>>Q*r@Vct?FjAC*uVa$MxBEH z^}iDM#$}~_AKPg4P1)77d!NsLwfR*1Jv=)?%y#-xqjsI6^}G4?Fj-UCHg3YF5pS+P zSoe7Qi=?-m|1`?6w)o=_TCHQ_{I>l%tn1yr`Pc^cYhAQ+)|{-Uh}-n(?Z*-Cu0$;O ziH$=b;_%%$C-~hefs&)Evw4GGH_JOlYNRt z|1&dX#F3Ui8Uva z`ZVv6{Oe9(M)qR;LYq?e%#eb5AvLzwx#rJZ;%$0~9_da09ExIpntU?*vhdr7AIZP+ zBAY~Bi+gBQ(QsJT$)jJ)8xi(3yiZSi-<#&&5}G|vf4Tob;*HhMet#}1s>>ToTdi`v zGlKTk*;2PvqgFNM_~lnSYhznklJ+h7SoHD81s|LL^!WKBswyV$_sj2(B1XSEm(wbE zXveTYCbRq(9a#Kea);{sHDi-X?kGRt8?vvFc*)Vy#qx^#o1h~B-Rr$; zb~&gZc)QO76IvP`&+5s-#kqTKP5cZI{P`@7=s1e;QTIZ)m|3 zv0~nmf9Bp9Q((KXbVb^bn}6=^iI9KPjVb+Ek`PuInoCnBcpGL_j7|#rw*TX~mly6AJnHz_CA#MiPx*77 z+haG6ZZxuJc)+Ap9i}l^b2;DRZSMQ*@0$0k9@f4c7C9=@CboL=7H+JO*?2*N6}^5B zN^JJG`GeZd)}o|gLeG?dJpXvt_~8Wy^rm#bQlH*3ZZ4Qvvb1z{!piTtpPzrfkzj~x z5@(jUBJq;hv9>R}eeJhx*x``@lfv7ywM{Xdo$~V8tCu5RCOw(=M0uyAc;!9yt zJ*)bh8XfB%>pY_Kv}S>=qHEuAoX6f4H_G0hT0iFU-dUy z+A8~#c6sIr+Cr(BzFC2sOEc?amS*`BTq`XqUtN?~5?QvpI6u2x(%hoXPKh;&YlhVR z99-pJ<;JP5pl{mo-w}T_c?a?v<~%B4ysybuO#?AOGtH#8X@tXU_aAQluDjea-P+r` zX-t(1>MFHgdB=(!s!mN%cgPjmHS%#Ok15h5?o&BLzgVo`wbcnMMYD;NSHu)+3-j_C zWHwD3mU=(2dR%zoqO6^T;|-Uz+%Czjms^oTNBh;f8N^I+FTPhiyzFOj149u{Ggiu{ zu~^L}TFngeg_a+!{#YNgnQiN4ma1zk zf8ZOGGnLya`d8WMk0@uvZ7A2%*&->8T_UFqTl6QY>X&5YozLHyw=TbLVT+P!dc9%3 zS|k$K9?4GnN&CuMUx{bG#Oyt z(Zbg9k>v{WKjwP#-zLv==j3-%wiH9xvLrcIQ(c=ShijjjywncVG}SE7dFgg)=HkAS zXg(RsoS={$G&&z+OQlRDhHA#`#QGBiFNS}&7q#aEcspK~!av!NJT}qErC-GV=f`@TbeXBkY9mF|43EFp# zs32|WaJGQ$WjRtL{RuU3H&5ZEycgH<9;7u~0aH0!J}5J(hhzo^AsaK=hJO;T)g@Td zX?!w2CCq3c8Bc6zbJ~PtaeJ{6YJHj7LA4}aP>Z_=H{L+i@w+00kLFHLL7Di*m+`}D zxH!TGsAqXT4-yq(ta?ywu58pFHGDBV(eKcYFg}A~JVuOXyRqJIKydX)<(@%rSZZis z7^|4@WugzCBA$_C-a&nCEHYA6R#)?l{Fb_e^q`jX4fiv?Gt4x6tctH%VhB+`@&V)` zODC7rQEELtMp>*Mt6!@a)%hF>sgQUr^}Dh|U1Ypf4vD)xeydIs&n;3_xl~9#? 
z8GX66n8#nJZLvB5P|SY`XFgKZs!@CZbkuQ*M)c-q)#s`mAI{^|CK$_5o^QOOp5pf~ zt1)mE>IxUKNQA>fu;O)8rfyfd8#fyZloskXet~?0W}L%p*i*WK4k8Em6#hs!5nIxL zgo$XZRS$87hlvK{Cv@Q}aDo!)FgBKLAp79`Tp??qFF)e(yuR=xbD=3eAQxyL`$nJ9 zesl)e0#!Pl)}oA_A`juuc+la9W)m7m=d-oaYgWQO$-U&CY#AL+tcW?u6Ej3VaY9tk z)^ZkhE|KgdCJFYCaaJIGlJ2nUQZBm*{rr|_DXxj1qLRFVUcQ9|vjXNWYve1^NNI{R zOMWG-lM2{;8cgSt4aAA;0cK>Avow_^GL3Xm>Mf6y{xFF(hXx)d>ch2JDx%1BdY3Jr z8>uOq#oDrd^d>2WhHgb#z=NS8Ow@w^Qz9J623jF%!p75!4rCi?MisGK?BvCKv8doz zF&Ep!XHk!A5uL;+9>!0@+qtYZSMTw2aPGFkA-K#N5#TB7)<_tdkh z3u0wEG@G#^o(GF`?#aLM06cd%r+lC4i?Lo-lle)^`eSmA9KpY7@Wj50RiaXC!Vb*B z*r>2aR6E0y%;moYBg+Y+XGl}>m{^nEqP-Y^*)j4$?7qdKia&&Z880+cM+Z|+k|dgw zFfyGkr>*HVvYrehdeWGlqF-q~osQVqO6$X)>_^Vvx*f!dj=}G`&=4v}e<6VpD_xZ5$onOobR9n9PCA6T&>5_@ z)KVHHC9uoV18I_!!d6L_r2bNf)PogMNm?%%*ak@_pO)URTG9wfWgVrZ(n`r+&Sn5a5n zTFv^fQaXdSqHD-W@(Hs%88f$oJS3F5u|oK`Q!ukzNn7GbK472L<^cAhiX4AR!12E5u9wwbh z1nGbr+6NxhY-T5YWC`p7)3H-@EwWf3Er7E+fX+j_yE03uE~~&==L4DNGfOrRh#5hL zuo3ZzaO45hCg?nQ~e*o$4lfPsx^<&GiE58#5+Ky&mcE*veL?Xk<1ae-S z6`REnzK72TO7#=jaJr8J#g7Pc^eBXpzhDbH@z)Q$I!{$!Bg@`I{`CeLy$7y6Ke|~eu&3Zeu?J*H%F-hfo2Uj;lI>q)eWzh$D`n@59FJX5t{%vd-DoD zN{j^RX7C+w>Q{o1)CNMA0tbKc2r*qa!ttFhBT?cpc1ldfgQF}U+-k}YU^FEP7G@4ciGmF6N?Lz-T z8tqIQ1I?zw$1NczX>)Wzj7OGQMb{t#?vUQ3fCQnpA`-~Cll&lW;qu-iW^^B|K+Ihx z^=Vt`4m`L`Q|LRY2L>LXpQ#a9b_u-%MD7Y)-atIaLGqQHpnuT6@mh={r$}wui4G<6 z$sXFCt|YqP|rt&ZSd)GTB}Hy*7n22=7CHIYpRAOi34Bm4{RFJ|M_Yr#yt_*nI$au;a! zTYbc5A?|u%ggwP!Jhu{^H#0;M?~VuxNA_<;!jX?m$Wh@X_KT5Z00}0wNQT%b`VyJ6 z7eeU3m7IhYv2;Xu5HF&NU8ED{a0>Op&RS33lS;&z8Mz^viiTnp?l2!t|3|WzdSc&| zkblWwIPGc3t{ccitoaIZUwjfvNfI3QhIA)wLwBK{ppqn#*N78mbmoN7V?dg|aOgW> zuh|gvYJgdf6_FT02#G^aN)~wHNko3Wun{-WadZH4^%cDw*5E&HaZS1SAWVo5lQH9u zz+<+GWbsC9CfkJ>SWz;1DLRQL@SZ2=lj(_f`wncs3;1{&@el9LErF#mY9{vq_vr`s ze-wjrCvQNwAY zN!10TMyeq@Y7^!DnjYf5;ZJ#U&0FKbsuX6g?o?xxOX}>ZK&5fn33UkZUZ zW74hii`q>L6Zhob7VS+=YEOtnrLv@N#o($VN;mqNx$CZj_ZakjXs_}|758=HiM8xx zwnazec&(=x$P##vdPG_+idi~cp=&Sp)B0u(@>Jbi-7o2zp;EuRs-SXirM2iU)n=|H zNzznlt#KNeXIbjuA|zG7l`3}jd3Ar zw!MZ5HPW!3&QTtzAIJrspj0R6{5AKcP2?`@tu$ON!w9C6bfKvA$R(zcW%gFSDm7=z zdAMk5JZQ|)Uu6@e2ds^3WTzEl)#0kEm7R@;jPsC>QsghF+XC@C3$cwFh)(v`gh<{R zKe?l!^1zJGfB~OVyD3A7m1-oe#^vg9rH0X5`9~RO_^kJ& z%Zzr^2%fcG8K4S&8hpqbZhMS+3B4-ENHpfBSbS118Df;bJgoAT(ufZ<Lch9_ch%} zJMCL$6hzI#PG82>>pZpB*%)b&_6L0>Rw^|lS^A@Cs-3MXl=I|$@>BHWEC8}a@T-~x zR!6echG`pXca!3(YsNj|q-M5d4HKp<$KbKb6ffd8(Z%k5jiA9So9UrQB4nvK~@T_LtcSs-hZHwTcyb;X;vNuP8&v8ewX5 zSJx1O;gN2XX+K)3$*#PoWJ$l+Yr0yF5=Y1j9%$HP*lKhGXFROU(|w_NV7;bQM&nA} zlXfxbZWg6mXmW;D7=0?AR<)GkG^XYnn^>EZ@_EiHo*EYMCL&N;Zb~%uSepJtRnIEF ziaOdK?6zqiv)U$}qOY<-KZHkWUzlf@++h2)-;JxPUKpy27jn5SNz>5c7wg4W7;=DY z3wU4F$K;-shv{YAeqKv7P(CS5**25k=0D8ln?BXtKqreizpm`pEY-|4nQ2l>qoBvJ z7Qb$20K9Q!yX9k=HDV68QBEq0`D$IL=>hhRmMfF028hw*7dxTd!2ER8wbs&g<+t$y zKWrSO>}LMbM)Hi@<{Rh=l5S{0j?ov?Q~H-)l+Tf)JX^ds%++r&Mksw~JN8++PCBcF z%1Xlx{XDKC1#G-_2>-@!tEPq`W0rDXZGcR=oS3mo$_rzn{+8jDQRsJwLR0}GX*S<0 z_LB;9ojo#GskK;$^p+)wBgVs(yY*?tOrv7dQ7_3|)~Ra`>N$O&{2YsnSltzY0e4J=R2eTx$mOWYLv@p)oa2S!5Bo#9h<%EQ zQ8oq(6R^cIa!a;|*5&h1>!0P-Xc9UdJEE#cBlYDo!bfbR8R{p#NZjSMMMue*?5EdI zm+TRbg^ukf(O>T)B$!n$g{Xx{0bn-*%1(`j@lzrcG!=d;sw zliNx;?2>R&$BJCmjc#Qr@_OwwX%^;VKe|$zsb^WSW|A~bHkYq~DHo}6#&BW}?l?wT z$o5IKs2g^0G>@lg@)XHd+Qbe+ms_Rm5qE_fO_vsEOzB#6jB%Pn%sHJPtYSD!2E_zLu^-bC&WWR1vbR35jWqjZ60^9$$~!-t^j zw7vSv_)eLlb`mSuLg33QVDS>Qn&^QpUK?^8RmTBiZ)0oqfLO^Nqt@sJRx}NrjYe?B zf5B#gRDxdB_r@RUd@+S9La#pM71RnlFqp0<0ir)|CkB$^YCADlT&0`D4E`PU*Ck}; zD{Pl&#&@FrjzGunVR4#8k`#3&*$9p`3hLZj$&p&I6~Y#CcN)FLWps_C!Miz3BAAxc zB|Z2Q_MXKME!oAsN=`&#gR!!m$y(7E%J+TLs|wTsTd;Cl8X_`?7xLH|T7{K=LmEN1 
zSxalP5^9C2aT9iI8E=N`(&#moPj9fFQgvb!+r?F^U_OqW*cKV>^$v{0i z7n;UlbjnXbr~6D)F+tRh6+lhCK-2NO)2O9evla3UX*Bx5m*Kt}P?dTxYnDNOlR@;m z(4&su3MN{ed;xp-K`x>DHi+*MozR>7mxn+>3MP$cpzsvy`BZdAPvrlKc*040blC0@ zAB82>Wj*G-6ExGcsCL?lyXsZtx%y3g!XI)4npZ=v6$_C^9H6BiBX-o3>ntyzUT<=ggUnxxN{r6jVGyj{DK(F_Yfm5 z75=m%w6v$FjYgnDycVjiIV6@I0IOX~c8TLW7m92Es?AE&4(Fk$P2ju5OL9f@7bnOG zaOs|?dF~1dwKkk?WZpDOOcfk;SZlIZtR{z1S&X51L`B6qoZJ-~$zIx?grIUCMfQQO zKgavG##w-)v@Z#S{@_Y4;W@eBbvH4N{%{HijS|l2o)19HdJNjYOtu7T6H9&RCe%11 zh(u$dDkVW7O-D^qfqZ9yDy4uNr1j_{tZF04}CC2EQ0EqC>B8@Y(|1G?ph)qbxA*Tn>QCtga)-z zZ`A45c+Uj8sWmF62dD=75N}i?dek||yt9}jO85lS{!>sfZb9$#d{k0zNr5FbwFlcSH z`Alf1eZ*C~m)p=7YLMwTDewxX0a~CZosqes8|g<9#9*MKANJ8C@<>FYdJlyj-Gn?P zyP>Hji5$wvTe^n^kv&+0wWwj9LFM#kt#HP{nVy4YI*>JG4d{HyoSj9Vb_3}Iw5yJ6 zGkCof-9)z_dN$KV^f~zk?Ls9hr~#`!oQ1MlY&h~p8@hv(k$dD8{lfY}gR+nuStV2n zBQ&sfY&8_Bd+ZZy#g+o6qHu2H2c69l=^3a`wJG}CvD2^7uB?J(vOCm)xid#pt(CZ{ zCmSqvlxj#D*a0?-ZDe<83n;ORBn{O11oj>Z=rpk+7$^LoLp3D#L@{>}wV)Cn!n(KP4S*&yP~ir0 zf8Lolg}(6P|7$*;{|oKGUsyrKK`9A#H zgs-ciOhlv7HwX{$5z1J!`rltJ^F~l&P9lma-a||5oeog+PhyUyVQMoyF757X+Gv* ziwGjO;9M=jc-8?=c3^Z}h=Ul3eUS!L?+z-rHPA3-;rZKvkf<5)jEQ)Lvv3e~;A3D; zxiwU54i;pMJw6rcPFK-K?8Dw&B~;8&6LF01=Qp8jPsASI$1{=3cZ;34e*pBNhtNa1 z;S@q$?#RQ`aHz?_ytP_WeV`75Q+`?9q-H6b)jR4k^@#cqs`VE&8_IQO^_`llHiDb? zuewhC7n+d=+{d$*+Ei3!5MErCS#(A?fYlQjKr7U8nI=UG_e z5u%zf!GD4O|IJ*ST{Toi0NBgn~OC zJLNgNn6*IcSSV2)Ne*caW$-%oPc>>JF(iwurhkDN{o%$GVIO8;e^%lIN*Hp3j?IM1 z7)n3FgV}?#5S!Q}_8chEkh!sQU?O=?#Jl3UR_Hx`iE}fZ**Mk}u1yJT%8b;EJ)x1* zh3TPiW!< zH?&PxMCAn{)1~Am&O4Pst!@aeSw@ooi#g1Q4H))x;6e>D6}f1wSR!^qaZD3#_~|nA z>W4`bXS@ zlKqm8fOb{``b0GN#ZNrvCgS)N?mbfc#d)+T(5J^iCC$OUe}j`Pe!zfqps~?hE#1s1ECalmvx)_esVd&MH={*|CCNn$an)YC8DxD2q zU=juYf`_{ne(N%<>LXG`dSg}Xn1UMRGuWIHkbVy$>OY_3D76JwJ;tI~U$}M6ajMS% z?WVotAkAkZ*-^NCf$&_s!6A4P)0?X8jy}+0{h?sq%oWDdIJ%#&WhE+;|nstk>0FUn`;=lz~ zqFdn!uZFzQ0It_BI2(?L11jFa%}{_4XP`~n@pmdkJhTQ|=q@h7+pxha5`jJop^EE~ zUChO8{^I}ma*@m8HM%mYAuF6lOlBkYT0yrD7jB1XUH<9*KtM|Aq_# z&#)lRkWEe_hlC*$9E6h;3q3plETtR#$D2_0Q$+-@Sw{9efJps_eu5zEqN&iiry=U9 zBXV~EV-^5q+JZAiBeO7?BkX`C7jZW26>`KCI2;e)^ZFv9M`3j?l9_NxkHK?V51yHf zHTneo`8!6lggnHasVSA% z$n^~9c9{GjnWzwB={}5R5o)Wcv@70WPwbt1ULux|?ifR7xE8KBZTl3j>ejI1mLXWJnZ-e z+!T(~zd(IcFh*-|hU)lxGS>Yc_!Rx&KX~y+YCGI%A?|k&HN^ye2N~ikIKgMEqaRN| z??{FEOl^p)(Ht0Y2Z*x@>U})Uv0B2hw?TC}0vM!)TX6^Z%>j7U3kc_h(Vqs(UJr~r zffJ%BaNvH3g=8)L4Ca0VE}tLpzZ!WAwg8O*eDf}5ttOd|*N!8eSfNd%6}XlU-Gdn% zgZZ0^wfh2B;~+XhW`Z>?#!7X@-v3MN!6&o8D1(40kKj>cz*{olkn|`Zaw!=FY%V5e zNLS_nS929q>!wW0dNPViZy|kxz4;maCo||?bgeW%u6Tj^bP4N$D{Hb(cx`iZa5w>l z#!?FYITC+=1|GYWM&O+2T*Pi1@b141_5}PT1f$za|G^cb(8F~P(bIrZ_$0NE%@W~x z3DiH^$pKtl1=I}{wm@t*Vh2y87h>@g-e(gaSX;UsSA8Yd5HEQ+=Q<62FFSyqen2)4 zItN)rLk5HOFXi<_l4wI5kVmIt=8mE2S^x)WKRo%daEG2?w>^QcR3e%p&Qsx0R`6qR zr_ZCZNJoCy$A5Dlk%}tg9z1(*F^|6l(>Q`ia1*C+l6)TGX&89NVL0utP-pr;t+zm} zw;eg87^pB6eH{{9n2qqc88X6CAk{?fi`q#e4xpxqLCiE1jj*#$ilc~&IH(W@!7{!1 zA~>iMxdt6VtAJqJfh&`dp3Z0|C&1;NVxL|kGf4;V{D<@~{TC7P2*|M?O299$`m^Xm`bt`&wofH>v1`Mz zQjh2cwhK=AN@S9m@I5W)a_sGexL*g7i=6ich;$TQ=xT72Zpde=u~Ta!x<E1+&FkM9_WUTrqs* z*1%FWasW7!GdTuUQUrIhJJ#ZXs6%$5z6^$j`4T&0 zCHnY2Vn=R5kH9sc!dsll_d~6E7cTimIF}vhFM0=k3?q=~P1s}1uqW$)eE$^d@SDca zMfkLUA?P-n3uZVERig>>W5?JzIPFPLh^vETe!`~%v_UO;kqt)eY7IrQC#ym%80bxQ zk2$kr7q{9HJBJ&Ys|t308pj?8fV#&;V4bP1r#o$vf1t-uT3V zEqHEQjAAT0=mLRKQ_wNd9gH{%IFk;2V={W*>Vb`%0hZiCx7=X(0QZ3~If%hh;zbvr zH)ThB(K82tL7^ge1KH?l|*fPHo;bM3jS?<@Q9}P&3_*D zRG@tv^bk{^eH(mT34FVZD(VF)*AA#GLZFGg;~i0t^~D{I099TgpM65yB_h@~^FL}| zaE}nwC;7-kA-o>`g43(0ZyTcOwB_c=X{~vLx)zai4Lvq9G0tPW4)y}v03}r7+-V%8s4jJe#_YxzZLJR?G83ujpCb}rTbBF(W%78|MpGTkJ 
zIqoVp{PzUtU|&(qlpt%hfIemiwY4jFOl{oBfbo{7U2zKiEgZvg)mn5Fy-?BC0=7&> z9E^k(kpXrZ2^Fvts@%hf(*L>(-vS>8BTk(W6ZP@=2hD(v)6kn~0Y;`n4Yvynt}i&_ ze;rvh(DP=Bdan?Oat8D2g6Qai%HtaHeIV+=W$3ip4^EhXHL`(DWQD$__oSHINAJr+ z{52YDV-H*p!OtPU|0l>hmr?mF0GlYoYhU7y=Yam{sE?k5YobvVxx5b$$qhWUKADXi z((C^!Nj<1&W3k#tph?AnL5!p|fL_~>xr(4U^&~UU`xpw13B5(oXk&o^E@0IEpt3s! z{kshBCLBAyP?V#(>jquV0jdf-9n|;MV8RQ5nW^YwwnDZlN5>=wk~Rm9)uj)SzaIf< zB9X~-Q2)A+3&It=)&;w%5incD{cj?YvN4wp#8>Rg>)5qk;1dLVU^iy|9QNZtkqCTw z3qJA+dCC&{NgDR-E%4Cq;H}|cwmwjd-=hadMyB2X^(+x;QZ8zd0C1Jjz^pLrW^~X3 zWi0UNC*`O^@X0h7`#9j!S7_{6qC0fVlR&CQ$kHna7wAYtcS|BF*f*%F5)eB>p;tJO z=fV!WvnQE}cz7%Bq2e5c>$YN*Gw|$l$ zJ&eC?!S}BQNBGQ7c%91koQnZ~)@`5g7th{D7F& zLc@JYdSI;U;gL*7gb;j=%S9+`^P%4L!#wJtm7M^K+J>5|KQc%Z=JPB*AH)P#o&|Qm Nt3W32ggM$F{s(V4!Cn9W literal 0 HcmV?d00001 diff --git a/test/test_samples/text_wer b/test/test_samples/text_wer new file mode 100644 index 0000000..567a094 --- /dev/null +++ b/test/test_samples/text_wer @@ -0,0 +1 @@ +test_wer.wav Look! \ No newline at end of file diff --git a/tools/install_faster-whisper.sh b/tools/install_faster-whisper.sh new file mode 100644 index 0000000..45a7f81 --- /dev/null +++ b/tools/install_faster-whisper.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -e + +pip install faster-whisper + +if ! command -v nvcc &>/dev/null; then + echo "Error: nvcc not found. Please install the CUDA Toolkit first." >&2 + exit 1 +fi + +cuda_ver=$(nvcc --version | sed -nE 's/.*release ([0-9]+\.[0-9]+).*/\1/p') +cuda_major=${cuda_ver%%.*} +echo "Detected CUDA version:$cuda_ver" + +if [ "$cuda_major" -ge 12 ]; then + conda install -c conda-forge "cudnn=9.*" "numpy<2.3" +elif [ "$cuda_major" -eq 11 ]; then + conda install -c conda-forge "cudnn=8.*" "numpy<2.3" + pip install --force-reinstall 'ctranslate2==3.24.0' 'numpy<2.2' +else + echo "Error: Unsupported CUDA major version $cuda_major" >&2 + exit 1 +fi \ No newline at end of file diff --git a/versa/__init__.py b/versa/__init__.py index 8471d37..7f8ca42 100644 --- a/versa/__init__.py +++ b/versa/__init__.py @@ -59,6 +59,14 @@ faster_whisper_levenshtein_metric, faster_whisper_wer_setup, ) +from versa.corpus_metrics.nemo_wer import ( + nemo_levenshtein_metric, + nemo_wer_setup, +) +from versa.corpus_metrics.hubert_wer import ( + hubert_levenshtein_metric, + hubert_wer_setup, +) from versa.utterance_metrics.asr_matching import asr_match_metric, asr_match_setup from versa.utterance_metrics.audiobox_aesthetics_score import ( audiobox_aesthetics_score, diff --git a/versa/corpus_metrics/faster_whisper_wer.py b/versa/corpus_metrics/faster_whisper_wer.py new file mode 100644 index 0000000..fe07a7d --- /dev/null +++ b/versa/corpus_metrics/faster_whisper_wer.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Haoran Wang +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import logging + +import librosa +import numpy as np +import torch +from Levenshtein import opcodes + +try: + from faster_whisper import WhisperModel, BatchedInferencePipeline +except ImportError: + logging.warning( + "Faster-whisper is not properly installed. 
Please install following https://github.com/systran/faster-whisper" + ) + WhisperModel = None + +from espnet2.text.cleaner import TextCleaner + +TARGET_FS = 16000 + + +def faster_whisper_wer_setup( + model_tag="default", beam_size=5, batch_size=1, compute_type="float32" ,text_cleaner="whisper_basic", use_gpu=True +): + if model_tag == "default": + model_tag = "large-v3" + device = "cuda" if use_gpu else "cpu" + if WhisperModel is None: + raise RuntimeError( + "Whisper WER is used for evaluation while faster-whisper is not installed" + ) + model_whisper = WhisperModel(model_tag, device=device, compute_type=compute_type) + if batch_size > 1: + model = BatchedInferencePipeline(model=model_whisper) + else: + model = model_whisper + textcleaner = TextCleaner(text_cleaner) + wer_utils = {"model": model, "cleaner": textcleaner, "beam_size": beam_size, "batch_size": batch_size, "compute_type": compute_type} + return wer_utils + + +def faster_whisper_levenshtein_metric( + wer_utils, pred_x, ref_text, fs=16000, cache_pred_text=None +): + """Calculate the Levenshtein distance between ref and inf ASR results. + + Args: + wer_utils (dict): a utility dict for WER calculation. + including: faster-whisper model ("model"), text cleaner ("textcleaner"), + beam size ("beam size") and batch size ("batch_size") + pred_x (np.ndarray): test signal (time,) + ref_text (string): reference transcript + cache_pred_text (string): transcription from cache (previous modules) + fs (int): sampling rate in Hz + Returns: + ret (dict): ditionary containing occurrences of edit operations + """ + if cache_pred_text is not None: + inf_text = cache_pred_text + else: + if fs != TARGET_FS: + pred_x = librosa.resample(pred_x, orig_sr=fs, target_sr=TARGET_FS) + fs = TARGET_FS + with torch.no_grad(): + if wer_utils["batch_size"] > 1: + pred_x = pred_x.astype(getattr(np, wer_utils["compute_type"])) + inf_output, _ = wer_utils["model"].transcribe( + pred_x, beam_size=wer_utils["beam_size"], batch_size=wer_utils["batch_size"] + ) + inf_text = "".join(segment.text for segment in inf_output) + else: + inf_output, _ = wer_utils["model"].transcribe( + pred_x, beam_size=wer_utils["beam_size"] + ) + inf_text = "".join(segment.text for segment in inf_output) + + + ref_text = wer_utils["cleaner"](ref_text).strip() + pred_text = wer_utils["cleaner"](inf_text).strip() + + # process wer + ref_words = ref_text.strip().split() + pred_words = pred_text.strip().split() + ret = { + "faster_whisper_hyp_text": pred_text, + "ref_text": ref_text, + "faster_whisper_wer_delete": 0, + "faster_whisper_wer_insert": 0, + "faster_whisper_wer_replace": 0, + "faster_whisper_wer_equal": 0, + } + for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words): + if op == "insert": + ret["faster_whisper_wer_" + op] = ret["faster_whisper_wer_" + op] + inf_et - inf_st + else: + ret["faster_whisper_wer_" + op] = ret["faster_whisper_wer_" + op] + ref_et - ref_st + total = ( + ret["faster_whisper_wer_delete"] + + ret["faster_whisper_wer_replace"] + + ret["faster_whisper_wer_equal"] + ) + assert total == len(ref_words), (total, len(ref_words)) + total = ( + ret["faster_whisper_wer_insert"] + + ret["faster_whisper_wer_replace"] + + ret["faster_whisper_wer_equal"] + ) + assert total == len(pred_words), (total, len(pred_words)) + + # process cer + ref_words = [c for c in ref_text] + pred_words = [c for c in pred_text] + ret["faster_whisper_cer_delete"] = 0 + ret["faster_whisper_cer_insert"] = 0 + ret["faster_whisper_cer_replace"] = 0 + ret["faster_whisper_cer_equal"] 
= 0 + for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words): + if op == "insert": + ret["faster_whisper_cer_" + op] = ret["faster_whisper_cer_" + op] + inf_et - inf_st + else: + ret["faster_whisper_cer_" + op] = ret["faster_whisper_cer_" + op] + ref_et - ref_st + total = ( + ret["faster_whisper_cer_delete"] + + ret["faster_whisper_cer_replace"] + + ret["faster_whisper_cer_equal"] + ) + assert total == len(ref_words), (total, len(ref_words)) + total = ( + ret["faster_whisper_cer_insert"] + + ret["faster_whisper_cer_replace"] + + ret["faster_whisper_cer_equal"] + ) + assert total == len(pred_words), (total, len(pred_words)) + + return ret + + +if __name__ == "__main__": + a = np.random.random(16000) + wer_utils = faster_whisper_wer_setup() + print( + "metrics: {}".format( + faster_whisper_levenshtein_metric(wer_utils, a, "test a sentence.", 16000) + ) + ) diff --git a/versa/corpus_metrics/hubert_wer.py b/versa/corpus_metrics/hubert_wer.py new file mode 100644 index 0000000..efb7d5d --- /dev/null +++ b/versa/corpus_metrics/hubert_wer.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 + +# Copyright 2024 Jiatong Shi +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import logging + +import librosa +import numpy as np +import torch +from Levenshtein import opcodes + +try: + from transformers import Wav2Vec2Processor, HubertForCTC +except ImportError: + logging.warning( + "transformers is not properly installed." + ) + Wav2Vec2Processor = None + HubertForCTC = None + +from espnet2.text.cleaner import TextCleaner + +TARGET_FS = 16000 + + +def hubert_wer_setup( + model_tag="default", text_cleaner="whisper_basic", use_gpu=True +): + if model_tag == "default": + model_tag = "facebook/hubert-large-ls960-ft" + device = "cuda" if use_gpu else "cpu" + if Wav2Vec2Processor is None and HubertForCTC is None: + raise RuntimeError( + "Facebook's hubert WER is used for evaluation while transformers is not installed" + ) + processor = Wav2Vec2Processor.from_pretrained(model_tag) + model = HubertForCTC.from_pretrained(model_tag) + + textcleaner = TextCleaner(text_cleaner) + wer_utils = {"model": model, "processor": processor, "cleaner": textcleaner} + return wer_utils + + +def hubert_levenshtein_metric( + wer_utils, pred_x, ref_text, fs=16000, cache_pred_text=None +): + """Calculate the Levenshtein distance between ref and inf ASR results. + + Args: + wer_utils (dict): a utility dict for WER calculation. 
+ including: hubert asr model ("model"), text cleaner ("textcleaner") + pred_x (np.ndarray): test signal (time,) + ref_text (string): reference transcript + cache_pred_text (string): transcription from cache (previous modules) + fs (int): sampling rate in Hz + Returns: + ret (dict): ditionary containing occurrences of edit operations + """ + if cache_pred_text is not None: + inf_text = cache_pred_text + else: + if fs != TARGET_FS: + pred_x = librosa.resample(pred_x, orig_sr=fs, target_sr=TARGET_FS) + fs = TARGET_FS + with torch.no_grad(): + input_values = wer_utils["processor"](pred_x, return_tensors="pt").input_values + logits = wer_utils["model"](input_values).logits + predicted_ids = torch.argmax(logits, dim=-1) + inf_text = wer_utils["processor"].decode(predicted_ids[0]) + + ref_text = wer_utils["cleaner"](ref_text).strip() + pred_text = wer_utils["cleaner"](inf_text).strip() + + # process wer + ref_words = ref_text.strip().split() + pred_words = pred_text.strip().split() + ret = { + "hubert_hyp_text": pred_text, + "ref_text": ref_text, + "hubert_wer_delete": 0, + "hubert_wer_insert": 0, + "hubert_wer_replace": 0, + "hubert_wer_equal": 0, + } + for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words): + if op == "insert": + ret["hubert_wer_" + op] = ret["hubert_wer_" + op] + inf_et - inf_st + else: + ret["hubert_wer_" + op] = ret["hubert_wer_" + op] + ref_et - ref_st + total = ( + ret["hubert_wer_delete"] + + ret["hubert_wer_replace"] + + ret["hubert_wer_equal"] + ) + assert total == len(ref_words), (total, len(ref_words)) + total = ( + ret["hubert_wer_insert"] + + ret["hubert_wer_replace"] + + ret["hubert_wer_equal"] + ) + assert total == len(pred_words), (total, len(pred_words)) + + # process cer + ref_words = [c for c in ref_text] + pred_words = [c for c in pred_text] + ret.update( + hubert_cer_delete=0, + hubert_cer_insert=0, + hubert_cer_replace=0, + hubert_cer_equal=0, + ) + for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words): + if op == "insert": + ret["hubert_cer_" + op] = ret["hubert_cer_" + op] + inf_et - inf_st + else: + ret["hubert_cer_" + op] = ret["hubert_cer_" + op] + ref_et - ref_st + total = ( + ret["hubert_cer_delete"] + + ret["hubert_cer_replace"] + + ret["hubert_cer_equal"] + ) + assert total == len(ref_words), (total, len(ref_words)) + total = ( + ret["hubert_cer_insert"] + + ret["hubert_cer_replace"] + + ret["hubert_cer_equal"] + ) + assert total == len(pred_words), (total, len(pred_words)) + + return ret + + +if __name__ == "__main__": + a = np.random.random(16000) + wer_utils = hubert_wer_setup() + print( + "metrics: {}".format( + hubert_levenshtein_metric(wer_utils, a, "test a sentence.", 16000) + ) + ) diff --git a/versa/corpus_metrics/nemo_wer.py b/versa/corpus_metrics/nemo_wer.py new file mode 100644 index 0000000..16277cf --- /dev/null +++ b/versa/corpus_metrics/nemo_wer.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 + +# Copyright 2024 Jiatong Shi +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import logging + +import librosa +import numpy as np +import torch +from Levenshtein import opcodes + +try: + import nemo.collections.asr as nemo_asr +except ImportError: + logging.warning( + "NeMo is not properly installed. 
Please install following https://github.com/NVIDIA/NeMo" + ) + nemo_asr= None + +from espnet2.text.cleaner import TextCleaner + +TARGET_FS = 16000 + + +def nemo_wer_setup( + model_tag="default", text_cleaner="whisper_basic", use_gpu=True +): + if model_tag == "default": + model_tag = "nvidia/stt_en_conformer_transducer_xlarge" + device = "cuda" if use_gpu else "cpu" + if nemo_asr is None: + raise RuntimeError( + "NeMo WER is used for evaluation while NeMo is not installed" + ) + asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_tag) + textcleaner = TextCleaner(text_cleaner) + wer_utils = {"model": asr_model, "cleaner": textcleaner} + return wer_utils + + +def nemo_levenshtein_metric( + wer_utils, pred_x, ref_text, fs=16000, cache_pred_text=None +): + """Calculate the Levenshtein distance between ref and inf ASR results. + + Args: + wer_utils (dict): a utility dict for WER calculation. + including: nemo asr model ("model"), text cleaner ("textcleaner") + pred_x (np.ndarray): test signal (time,) + ref_text (string): reference transcript + cache_pred_text (string): transcription from cache (previous modules) + fs (int): sampling rate in Hz + Returns: + ret (dict): ditionary containing occurrences of edit operations + """ + if cache_pred_text is not None: + inf_text = cache_pred_text + else: + if fs != TARGET_FS: + pred_x = librosa.resample(pred_x, orig_sr=fs, target_sr=TARGET_FS) + fs = TARGET_FS + with torch.no_grad(): + inf_text = wer_utils["model"].transcribe( + audio=pred_x + )[0].text + + ref_text = wer_utils["cleaner"](ref_text).strip() + pred_text = wer_utils["cleaner"](inf_text).strip() + + # process wer + ref_words = ref_text.strip().split() + pred_words = pred_text.strip().split() + ret = { + "nemo_hyp_text": pred_text, + "ref_text": ref_text, + "nemo_wer_delete": 0, + "nemo_wer_insert": 0, + "nemo_wer_replace": 0, + "nemo_wer_equal": 0, + } + for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words): + if op == "insert": + ret["nemo_wer_" + op] = ret["nemo_wer_" + op] + inf_et - inf_st + else: + ret["nemo_wer_" + op] = ret["nemo_wer_" + op] + ref_et - ref_st + total = ( + ret["nemo_wer_delete"] + + ret["nemo_wer_replace"] + + ret["nemo_wer_equal"] + ) + assert total == len(ref_words), (total, len(ref_words)) + total = ( + ret["nemo_wer_insert"] + + ret["nemo_wer_replace"] + + ret["nemo_wer_equal"] + ) + assert total == len(pred_words), (total, len(pred_words)) + + # process cer + ref_words = [c for c in ref_text] + pred_words = [c for c in pred_text] + ret.update( + nemo_cer_delete=0, + nemo_cer_insert=0, + nemo_cer_replace=0, + nemo_cer_equal=0, + ) + for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words): + if op == "insert": + ret["nemo_cer_" + op] = ret["nemo_cer_" + op] + inf_et - inf_st + else: + ret["nemo_cer_" + op] = ret["nemo_cer_" + op] + ref_et - ref_st + total = ( + ret["nemo_cer_delete"] + + ret["nemo_cer_replace"] + + ret["nemo_cer_equal"] + ) + assert total == len(ref_words), (total, len(ref_words)) + total = ( + ret["nemo_cer_insert"] + + ret["nemo_cer_replace"] + + ret["nemo_cer_equal"] + ) + assert total == len(pred_words), (total, len(pred_words)) + + return ret + + +if __name__ == "__main__": + a = np.random.random(16000) + wer_utils = nemo_wer_setup() + print( + "metrics: {}".format( + nemo_levenshtein_metric(wer_utils, a, "test a sentence.", 16000) + ) + ) diff --git a/versa/metrics.py b/versa/metrics.py index 067b6d2..e4c5029 100644 --- a/versa/metrics.py +++ b/versa/metrics.py @@ -31,6 +31,9 @@ 
"espnet_hyp_text", "owsm_hyp_text", "whisper_hyp_text", + "faster_whisper_hyp_text", + "nemo_hyp_text", + "hubert_hyp_text" ] NUM_METRIC = [ @@ -112,14 +115,44 @@ "owsm_cer_equal", "whisper_wer", "whisper_wer_delete", - "espnet_wer_insert", - "espnet_wer_replace", - "espnet_wer_equal", + "whisper_wer_insert", + "whisper_wer_replace", + "whisper_wer_equal", "whisper_cer", "whisper_cer_delete", - "espnet_cer_insert", - "espnet_cer_replace", - "espnet_cer_equal", + "whisper_cer_insert", + "whisper_cer_replace", + "whisper_cer_equal", + "faster_whisper_wer", + "faster_whisper_wer_delete", + "faster_whisper_wer_insert", + "faster_whisper_wer_replace", + "faster_whisper_wer_equal", + "faster_whisper_cer", + "faster_whisper_cer_delete", + "faster_whisper_cer_insert", + "faster_whisper_cer_replace", + "faster_whisper_cer_equal", + "nemo_wer", + "nemo_wer_delete", + "nemo_wer_insert", + "nemo_wer_replace", + "nemo_wer_equal", + "nemo_cer", + "nemo_cer_delete", + "nemo_cer_insert", + "nemo_cer_replace", + "nemo_cer_equal", + "hubert_wer", + "hubert_wer_delete", + "hubert_wer_insert", + "hubert_wer_replace", + "hubert_wer_equal", + "hubert_cer", + "hubert_cer_delete", + "hubert_cer_insert", + "hubert_cer_replace", + "hubert_cer_equal", "emotion_similarity", "spk_similarity", "nomad", diff --git a/versa/scorer_shared.py b/versa/scorer_shared.py index f9d35e7..fcafff7 100644 --- a/versa/scorer_shared.py +++ b/versa/scorer_shared.py @@ -390,6 +390,58 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal "args": args_cache, } logging.info("Initiate faster_whisper WER calculation successfully") + elif config["name"] == "nemo_wer": + if not use_gt_text: + logging.warning("Cannot use nemo_wer because no gt text is provided") + continue + + logging.info("Loading nemo_wer metric with reference text") + from versa import nemo_levenshtein_metric, nemo_wer_setup + + # Load nemo asr model if it is already loaded + if ( + "speaking_rate" in score_modules.keys() + or "asr_matching" in score_modules.keys() + ): + args_cache = score_modules["speaking_rate"]["args"] + else: + args_cache = nemo_wer_setup( + model_tag=config.get("model_tag", "default"), + text_cleaner=config.get("text_cleaner", "whisper_basic"), + use_gpu=use_gpu, + ) + + score_modules["nemo_wer"] = { + "module": nemo_levenshtein_metric, + "args": args_cache, + } + logging.info("Initiate NeMo WER calculation successfully") + elif config["name"] == "hubert_wer": + if not use_gt_text: + logging.warning("Cannot use hubert_wer because no gt text is provided") + continue + + logging.info("Loading hubert_wer metric with reference text") + from versa import hubert_levenshtein_metric, hubert_wer_setup + + # Load hubert asr model if it is already loaded + if ( + "speaking_rate" in score_modules.keys() + or "asr_matching" in score_modules.keys() + ): + args_cache = score_modules["speaking_rate"]["args"] + else: + args_cache = hubert_wer_setup( + model_tag=config.get("model_tag", "default"), + text_cleaner=config.get("text_cleaner", "whisper_basic"), + use_gpu=use_gpu, + ) + + score_modules["hubert_wer"] = { + "module": hubert_levenshtein_metric, + "args": args_cache, + } + logging.info("Initiate hubert WER calculation successfully") elif config["name"] == "scoreq_ref": if not use_gt: logging.warning("Cannot use scoreq_ref because no gt audio is provided") @@ -1029,7 +1081,7 @@ def use_score_modules(score_modules, gen_wav, gt_wav, gen_sr, text=None): score = score_modules[key]["module"]( score_modules[key]["model"], gen_wav, gt_wav, 
gen_sr ) - elif key == "espnet_wer" or key == "owsm_wer" or key == "whisper_wer" or key == "faster_whisper_wer": + elif key in ["espnet_wer", "owsm_wer", "whisper_wer", "faster_whisper_wer", "nemo_wer", "hubert_wer"]: score = score_modules[key]["module"]( score_modules[key]["args"], gen_wav, From 50d10c5b8593501096562c5a32d27975b7433a4d Mon Sep 17 00:00:00 2001 From: Haoran Wang <2472356642@qq.com> Date: Mon, 23 Jun 2025 13:21:26 -0500 Subject: [PATCH 3/6] fix: copyright --- versa/corpus_metrics/hubert_wer.py | 2 +- versa/corpus_metrics/nemo_wer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/versa/corpus_metrics/hubert_wer.py b/versa/corpus_metrics/hubert_wer.py index efb7d5d..ffcd249 100644 --- a/versa/corpus_metrics/hubert_wer.py +++ b/versa/corpus_metrics/hubert_wer.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2024 Jiatong Shi +# Copyright 2025 Haoran Wang # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) import logging diff --git a/versa/corpus_metrics/nemo_wer.py b/versa/corpus_metrics/nemo_wer.py index 16277cf..57d4c5b 100644 --- a/versa/corpus_metrics/nemo_wer.py +++ b/versa/corpus_metrics/nemo_wer.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2024 Jiatong Shi +# Copyright 2025 Haoran Wang # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) import logging From 718b2ed688fc3fdbb6ee40da3c84b373abd0d4ed Mon Sep 17 00:00:00 2001 From: Haoran Wang <2472356642@qq.com> Date: Mon, 23 Jun 2025 13:27:55 -0500 Subject: [PATCH 4/6] fix: a small bug in scorer_shared/list_scoring --- versa/scorer_shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/versa/scorer_shared.py b/versa/scorer_shared.py index fcafff7..4b768b0 100644 --- a/versa/scorer_shared.py +++ b/versa/scorer_shared.py @@ -1223,7 +1223,7 @@ def list_scoring( # Step2: load reference (gt) speech and conduct basic checks if gt_files is not None: - if key not in gen_files.keys(): + if key not in gt_files.keys(): logging.warning( "key {} not found in ground truth files though provided, skipping".format( key From c2892433c768afb283e0603e9a881c529bd1e2df Mon Sep 17 00:00:00 2001 From: Haoran Wang <2472356642@qq.com> Date: Mon, 23 Jun 2025 13:53:37 -0500 Subject: [PATCH 5/6] add: docs --- docs/supported_metrics.md | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/supported_metrics.md b/docs/supported_metrics.md index d72d1a2..14023e1 100644 --- a/docs/supported_metrics.md +++ b/docs/supported_metrics.md @@ -101,14 +101,17 @@ We include x mark if the metric is auto-installed in versa. 
| 3 | x | ESPnet Speech Recognition-based Error Rate | espnet_wer | espnet_wer |[ESPnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/pdf/1804.00015) | | 4 | x | ESPnet-OWSM Speech Recognition-based Error Rate | owsm_wer | owsm_wer |[ESPnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/abs/2309.13876) | | 5 | x | OpenAI-Whisper Speech Recognition-based Error Rate | whisper_wer | whisper_wer |[Whisper](https://github.com/openai/whisper) | [paper](https://arxiv.org/abs/2212.04356) | -| 6 | | Emotion2vec similarity (emo2vec) | emo2vec_similarity | emotion_similarity | [emo2vec](https://github.com/ftshijt/emotion2vec/tree/main) | [paper](https://arxiv.org/abs/2312.15185) | -| 7 | x | Speaker Embedding Similarity | speaker | spk_similarity | [espnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/abs/2401.17230) | -| 8 | | NOMAD: Unsupervised Learning of Perceptual Embeddings For Speech Enhancement and Non-Matching Reference Audio Quality Assessment | nomad | nomad |[Nomad](https://github.com/shimhz/nomad/tree/main) | [paper](https://arxiv.org/abs/2309.16284) | -| 9 | | Contrastive Language-Audio Pretraining Score (CLAP Score) | clap_score | clap_score | [fadtk](https://github.com/gudgud96/frechet-audio-distance) | [paper](https://arxiv.org/abs/2301.12661) | -| 10 | | Accompaniment Prompt Adherence (APA) | apa | apa | [Sony-audio-metrics](https://github.com/SonyCSLParis/audio-metrics) | [paper](https://arxiv.org/abs/2404.00775) | -| 11 | | Log Likelihood Ratio (LLR) | pysepm | pysepm_llr | [pysepm](https://github.com/shimhz/pysepm.git) | [Paper](https://ecs.utdallas.edu/loizou/speech/obj_paper_jan08.pdf)| -| 12 | x | Uni-VERSA (Versatile Speech Assessment with a Unified Framework) with Paired Text | universa | universa_{sub_metrics} | [Uni-VERSA](https://huggingface.co/collections/espnet/universa-6834e7c0a28225bffb6e2526) | [paper](https://arxiv.org/abs/2505.20741) | -| 13 | | Singer Embedding Similarity | singer | singer_similarity | [SSL-Singer-Identity](https://github.com/SonyCSLParis/ssl-singer-identity) | [paper](https://hal.science/hal-04186048v1) | +| 6 | | Faster-Whisper Speech Recognition-based Error Rate | faster_whisper_wer | faster_whisper_wer |[Faster-Whisper](https://github.com/systran/faster-whisper) | - | +| 7 | x | NVIDIA Conformer-Transducer X-Large Speech Recognition-based Error Rate | nemo_wer | nemo_wer |[NeMo](https://github.com/NVIDIA/NeMo) | [paper](https://arxiv.org/abs/2005.08100) | +| 8 | x | Facebook Hubert-Large-Finetuned Speech Recognition-based Error Rate | hubert_wer | hubert_wer |[HuBERT](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert) | [paper](https://arxiv.org/abs/2106.07447) | +| 9 | | Emotion2vec similarity (emo2vec) | emo2vec_similarity | emotion_similarity | [emo2vec](https://github.com/ftshijt/emotion2vec/tree/main) | [paper](https://arxiv.org/abs/2312.15185) | +| 10 | x | Speaker Embedding Similarity | speaker | spk_similarity | [espnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/abs/2401.17230) | +| 11 | | NOMAD: Unsupervised Learning of Perceptual Embeddings For Speech Enhancement and Non-Matching Reference Audio Quality Assessment | nomad | nomad |[Nomad](https://github.com/shimhz/nomad/tree/main) | [paper](https://arxiv.org/abs/2309.16284) | +| 12 | | Contrastive Language-Audio Pretraining Score (CLAP Score) | clap_score | clap_score | [fadtk](https://github.com/gudgud96/frechet-audio-distance) | [paper](https://arxiv.org/abs/2301.12661) | +| 13 | | 
Accompaniment Prompt Adherence (APA) | apa | apa | [Sony-audio-metrics](https://github.com/SonyCSLParis/audio-metrics) | [paper](https://arxiv.org/abs/2404.00775) | +| 14 | | Log Likelihood Ratio (LLR) | pysepm | pysepm_llr | [pysepm](https://github.com/shimhz/pysepm.git) | [Paper](https://ecs.utdallas.edu/loizou/speech/obj_paper_jan08.pdf)| +| 15 | x | Uni-VERSA (Versatile Speech Assessment with a Unified Framework) with Paired Text | universa | universa_{sub_metrics} | [Uni-VERSA](https://huggingface.co/collections/espnet/universa-6834e7c0a28225bffb6e2526) | [paper](https://arxiv.org/abs/2505.20741) | +| 16 | | Singer Embedding Similarity | singer | singer_similarity | [SSL-Singer-Identity](https://github.com/SonyCSLParis/ssl-singer-identity) | [paper](https://hal.science/hal-04186048v1) | ### Distributional Metrics (in verifying) From 878ef17cf133e1888748f8f55a78158289168196 Mon Sep 17 00:00:00 2001 From: Haoran Wang <2472356642@qq.com> Date: Tue, 24 Jun 2025 13:16:16 -0500 Subject: [PATCH 6/6] update --- docs/supported_metrics.md | 4 +- egs/separate_metrics/wer.yaml | 23 ++++--- setup.py | 2 - test/test_pipeline/test_general.py | 2 + test/test_pipeline/test_wer.py | 2 +- ..._faster-whisper.sh => install_fwhisper.sh} | 1 + tools/install_nemo.sh | 5 ++ versa/__init__.py | 6 +- ...{faster_whisper_wer.py => fwhisper_wer.py} | 58 ++++++++--------- versa/metrics.py | 22 +++---- versa/scorer_shared.py | 63 ++++++------------- 11 files changed, 85 insertions(+), 103 deletions(-) rename tools/{install_faster-whisper.sh => install_fwhisper.sh} (99%) create mode 100644 tools/install_nemo.sh rename versa/corpus_metrics/{faster_whisper_wer.py => fwhisper_wer.py} (72%) diff --git a/docs/supported_metrics.md b/docs/supported_metrics.md index 14023e1..77ce6e5 100644 --- a/docs/supported_metrics.md +++ b/docs/supported_metrics.md @@ -101,8 +101,8 @@ We include x mark if the metric is auto-installed in versa. 
| 3 | x | ESPnet Speech Recognition-based Error Rate | espnet_wer | espnet_wer |[ESPnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/pdf/1804.00015) | | 4 | x | ESPnet-OWSM Speech Recognition-based Error Rate | owsm_wer | owsm_wer |[ESPnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/abs/2309.13876) | | 5 | x | OpenAI-Whisper Speech Recognition-based Error Rate | whisper_wer | whisper_wer |[Whisper](https://github.com/openai/whisper) | [paper](https://arxiv.org/abs/2212.04356) | -| 6 | | Faster-Whisper Speech Recognition-based Error Rate | faster_whisper_wer | faster_whisper_wer |[Faster-Whisper](https://github.com/systran/faster-whisper) | - | -| 7 | x | NVIDIA Conformer-Transducer X-Large Speech Recognition-based Error Rate | nemo_wer | nemo_wer |[NeMo](https://github.com/NVIDIA/NeMo) | [paper](https://arxiv.org/abs/2005.08100) | +| 6 | | Faster-Whisper Speech Recognition-based Error Rate | fwhisper_wer | fwhisper_wer |[Faster-Whisper](https://github.com/systran/faster-whisper) | - | +| 7 | | NVIDIA Conformer-Transducer X-Large Speech Recognition-based Error Rate | nemo_wer | nemo_wer |[NeMo](https://github.com/NVIDIA/NeMo) | [paper](https://arxiv.org/abs/2005.08100) | | 8 | x | Facebook Hubert-Large-Finetuned Speech Recognition-based Error Rate | hubert_wer | hubert_wer |[HuBERT](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert) | [paper](https://arxiv.org/abs/2106.07447) | | 9 | | Emotion2vec similarity (emo2vec) | emo2vec_similarity | emotion_similarity | [emo2vec](https://github.com/ftshijt/emotion2vec/tree/main) | [paper](https://arxiv.org/abs/2312.15185) | | 10 | x | Speaker Embedding Similarity | speaker | spk_similarity | [espnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/abs/2401.17230) | diff --git a/egs/separate_metrics/wer.yaml b/egs/separate_metrics/wer.yaml index d636450..1c63624 100644 --- a/egs/separate_metrics/wer.yaml +++ b/egs/separate_metrics/wer.yaml @@ -58,26 +58,29 @@ # Word error rate with faster-whisper model +# Please refer to tools/install_fwhisper.sh for installing. # More model_tag can be from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/utils.py . # The default model is `large-v3`. -# --faster_whisper_hyp_text: the hypothesis from faster-whisper ASR decoding +# --fwhisper_hyp_text: the hypothesis from faster-whisper ASR decoding # --ref_text: reference text (after cleaner) -# --faster_whisper_wer_delete: delete errors -# --faster_whisper_wer_insert: insertion errors -# --faster_whisper_wer_replace: replacement errors -# --faster_whisper_wer_equal: correct matching words/character counts -# --faster_whisper_cer_delete: delete errors -# --faster_whisper_cer_insert: insertion errors -# --faster_whisper_cer_replace: replacement errors -# --faster_whisper_cer_equal: correct matching words/character counts -- name: faster_whisper_wer +# --fwhisper_wer_delete: delete errors +# --fwhisper_wer_insert: insertion errors +# --fwhisper_wer_replace: replacement errors +# --fwhisper_wer_equal: correct matching words/character counts +# --fwhisper_cer_delete: delete errors +# --fwhisper_cer_insert: insertion errors +# --fwhisper_cer_replace: replacement errors +# --fwhisper_cer_equal: correct matching words/character counts +- name: fwhisper_wer model_tag: default beam_size: 5 batch_size: 1 compute_type: float32 text_cleaner: whisper_basic + # Word error rate with NeMo asr model +# Please refer to tools/install_nemo.sh for installing. 
# The default model is `nvidia/stt_en_conformer_transducer_xlarge`. # --nemo_hyp_text: the hypothesis from NeMo ASR decoding # --ref_text: reference text (after cleaner) diff --git a/setup.py b/setup.py index 73a719c..92a2c68 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,6 @@ "importlib-metadata", "kaggle", "kaldiio", - "jamo", "lazy_loader", "Levenshtein", "librosa", @@ -59,7 +58,6 @@ "espnet_model_zoo", "discrete-speech-metrics @ git+https://github.com/ftshijt/DiscreteSpeechMetrics.git@v1.0.2", "cdpam", - "nemo_toolkit[asr]" ], extras_require={ "dev": [ diff --git a/test/test_pipeline/test_general.py b/test/test_pipeline/test_general.py index 09a165b..5d55bc4 100755 --- a/test/test_pipeline/test_general.py +++ b/test/test_pipeline/test_general.py @@ -40,6 +40,8 @@ "torch_squim_stoi": 0.6027805209159851, "torch_squim_pesq": 1.1683127880096436, "torch_squim_si_sdr": -11.109052658081055, + "dpam_distance": 0.15004253387451172, + "cdpam_distance": 0.05146043747663498, "dnsmos_pro_bvcc": 1.1717286109924316, "dnsmos_pro_nisqa": 1.4733699560165405, "dnsmos_pro_vcc2018": 1.930935263633728, diff --git a/test/test_pipeline/test_wer.py b/test/test_pipeline/test_wer.py index 2d14821..f358e4f 100755 --- a/test/test_pipeline/test_wer.py +++ b/test/test_pipeline/test_wer.py @@ -15,7 +15,7 @@ "espnet_wer_equal": 1, "owsm_wer_equal": 1, "whisper_wer_equal": 1, - "faster_whisper_wer_equal": 1, + "fwhisper_wer_equal": 1, "nemo_wer_equal": 1, "hubert_wer_equal": 1, } diff --git a/tools/install_faster-whisper.sh b/tools/install_fwhisper.sh similarity index 99% rename from tools/install_faster-whisper.sh rename to tools/install_fwhisper.sh index 45a7f81..eed9d70 100644 --- a/tools/install_faster-whisper.sh +++ b/tools/install_fwhisper.sh @@ -1,4 +1,5 @@ #!/bin/bash + set -e pip install faster-whisper diff --git a/tools/install_nemo.sh b/tools/install_nemo.sh new file mode 100644 index 0000000..431df50 --- /dev/null +++ b/tools/install_nemo.sh @@ -0,0 +1,5 @@ +#/bin/bash + +# NOTE(Haoran): Toolkit for nemo_wer + +pip install "nemo_toolkit[asr]" \ No newline at end of file diff --git a/versa/__init__.py b/versa/__init__.py index 7f8ca42..46531d2 100644 --- a/versa/__init__.py +++ b/versa/__init__.py @@ -55,9 +55,9 @@ whisper_levenshtein_metric, whisper_wer_setup, ) -from versa.corpus_metrics.faster_whisper_wer import ( - faster_whisper_levenshtein_metric, - faster_whisper_wer_setup, +from versa.corpus_metrics.fwhisper_wer import ( + fwhisper_levenshtein_metric, + fwhisper_wer_setup, ) from versa.corpus_metrics.nemo_wer import ( nemo_levenshtein_metric, diff --git a/versa/corpus_metrics/faster_whisper_wer.py b/versa/corpus_metrics/fwhisper_wer.py similarity index 72% rename from versa/corpus_metrics/faster_whisper_wer.py rename to versa/corpus_metrics/fwhisper_wer.py index fe07a7d..64cc7e2 100644 --- a/versa/corpus_metrics/faster_whisper_wer.py +++ b/versa/corpus_metrics/fwhisper_wer.py @@ -23,7 +23,7 @@ TARGET_FS = 16000 -def faster_whisper_wer_setup( +def fwhisper_wer_setup( model_tag="default", beam_size=5, batch_size=1, compute_type="float32" ,text_cleaner="whisper_basic", use_gpu=True ): if model_tag == "default": @@ -43,7 +43,7 @@ def faster_whisper_wer_setup( return wer_utils -def faster_whisper_levenshtein_metric( +def fwhisper_levenshtein_metric( wer_utils, pred_x, ref_text, fs=16000, cache_pred_text=None ): """Calculate the Levenshtein distance between ref and inf ASR results. 
diff --git a/versa/corpus_metrics/faster_whisper_wer.py b/versa/corpus_metrics/fwhisper_wer.py
similarity index 72%
rename from versa/corpus_metrics/faster_whisper_wer.py
rename to versa/corpus_metrics/fwhisper_wer.py
index fe07a7d..64cc7e2 100644
--- a/versa/corpus_metrics/faster_whisper_wer.py
+++ b/versa/corpus_metrics/fwhisper_wer.py
@@ -23,7 +23,7 @@
 TARGET_FS = 16000
 
 
-def faster_whisper_wer_setup(
+def fwhisper_wer_setup(
     model_tag="default", beam_size=5, batch_size=1, compute_type="float32" ,text_cleaner="whisper_basic", use_gpu=True
 ):
     if model_tag == "default":
@@ -43,7 +43,7 @@ def faster_whisper_wer_setup(
     return wer_utils
 
 
-def faster_whisper_levenshtein_metric(
+def fwhisper_levenshtein_metric(
     wer_utils, pred_x, ref_text, fs=16000, cache_pred_text=None
 ):
     """Calculate the Levenshtein distance between ref and inf ASR results.
@@ -86,53 +86,53 @@ def faster_whisper_levenshtein_metric(
     ref_words = ref_text.strip().split()
     pred_words = pred_text.strip().split()
     ret = {
-        "faster_whisper_hyp_text": pred_text,
+        "fwhisper_hyp_text": pred_text,
         "ref_text": ref_text,
-        "faster_whisper_wer_delete": 0,
-        "faster_whisper_wer_insert": 0,
-        "faster_whisper_wer_replace": 0,
-        "faster_whisper_wer_equal": 0,
+        "fwhisper_wer_delete": 0,
+        "fwhisper_wer_insert": 0,
+        "fwhisper_wer_replace": 0,
+        "fwhisper_wer_equal": 0,
     }
     for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words):
         if op == "insert":
-            ret["faster_whisper_wer_" + op] = ret["faster_whisper_wer_" + op] + inf_et - inf_st
+            ret["fwhisper_wer_" + op] = ret["fwhisper_wer_" + op] + inf_et - inf_st
         else:
-            ret["faster_whisper_wer_" + op] = ret["faster_whisper_wer_" + op] + ref_et - ref_st
+            ret["fwhisper_wer_" + op] = ret["fwhisper_wer_" + op] + ref_et - ref_st
     total = (
-        ret["faster_whisper_wer_delete"]
-        + ret["faster_whisper_wer_replace"]
-        + ret["faster_whisper_wer_equal"]
+        ret["fwhisper_wer_delete"]
+        + ret["fwhisper_wer_replace"]
+        + ret["fwhisper_wer_equal"]
     )
     assert total == len(ref_words), (total, len(ref_words))
     total = (
-        ret["faster_whisper_wer_insert"]
-        + ret["faster_whisper_wer_replace"]
-        + ret["faster_whisper_wer_equal"]
+        ret["fwhisper_wer_insert"]
+        + ret["fwhisper_wer_replace"]
+        + ret["fwhisper_wer_equal"]
     )
     assert total == len(pred_words), (total, len(pred_words))
 
     # process cer
     ref_words = [c for c in ref_text]
     pred_words = [c for c in pred_text]
-    ret["faster_whisper_cer_delete"] = 0
-    ret["faster_whisper_cer_insert"] = 0
-    ret["faster_whisper_cer_replace"] = 0
-    ret["faster_whisper_cer_equal"] = 0
+    ret["fwhisper_cer_delete"] = 0
+    ret["fwhisper_cer_insert"] = 0
+    ret["fwhisper_cer_replace"] = 0
+    ret["fwhisper_cer_equal"] = 0
     for op, ref_st, ref_et, inf_st, inf_et in opcodes(ref_words, pred_words):
         if op == "insert":
-            ret["faster_whisper_cer_" + op] = ret["faster_whisper_cer_" + op] + inf_et - inf_st
+            ret["fwhisper_cer_" + op] = ret["fwhisper_cer_" + op] + inf_et - inf_st
         else:
-            ret["faster_whisper_cer_" + op] = ret["faster_whisper_cer_" + op] + ref_et - ref_st
+            ret["fwhisper_cer_" + op] = ret["fwhisper_cer_" + op] + ref_et - ref_st
     total = (
-        ret["faster_whisper_cer_delete"]
-        + ret["faster_whisper_cer_replace"]
-        + ret["faster_whisper_cer_equal"]
+        ret["fwhisper_cer_delete"]
+        + ret["fwhisper_cer_replace"]
+        + ret["fwhisper_cer_equal"]
     )
     assert total == len(ref_words), (total, len(ref_words))
     total = (
-        ret["faster_whisper_cer_insert"]
-        + ret["faster_whisper_cer_replace"]
-        + ret["faster_whisper_cer_equal"]
+        ret["fwhisper_cer_insert"]
+        + ret["fwhisper_cer_replace"]
+        + ret["fwhisper_cer_equal"]
     )
     assert total == len(pred_words), (total, len(pred_words))
 
@@ -141,9 +141,9 @@ def faster_whisper_levenshtein_metric(
 
 if __name__ == "__main__":
     a = np.random.random(16000)
-    wer_utils = faster_whisper_wer_setup()
+    wer_utils = fwhisper_wer_setup()
     print(
         "metrics: {}".format(
-            faster_whisper_levenshtein_metric(wer_utils, a, "test a sentence.", 16000)
+            fwhisper_levenshtein_metric(wer_utils, a, "test a sentence.", 16000)
         )
     )
diff --git a/versa/metrics.py b/versa/metrics.py
index e4c5029..dd23572 100644
--- a/versa/metrics.py
+++ b/versa/metrics.py
@@ -31,7 +31,7 @@
     "espnet_hyp_text",
     "owsm_hyp_text",
     "whisper_hyp_text",
-    "faster_whisper_hyp_text",
+    "fwhisper_hyp_text",
     "nemo_hyp_text",
     "hubert_hyp_text"
 ]
@@ -123,16 +123,16 @@
     "whisper_cer_insert",
     "whisper_cer_replace",
     "whisper_cer_equal",
-    "faster_whisper_wer",
"faster_whisper_wer_delete", - "faster_whisper_wer_insert", - "faster_whisper_wer_replace", - "faster_whisper_wer_equal", - "faster_whisper_cer", - "faster_whisper_cer_delete", - "faster_whisper_cer_insert", - "faster_whisper_cer_replace", - "faster_whisper_cer_equal", + "fwhisper_wer", + "fwhisper_wer_delete", + "fwhisper_wer_insert", + "fwhisper_wer_replace", + "fwhisper_wer_equal", + "fwhisper_cer", + "fwhisper_cer_delete", + "fwhisper_cer_insert", + "fwhisper_cer_replace", + "fwhisper_cer_equal", "nemo_wer", "nemo_wer_delete", "nemo_wer_insert", diff --git a/versa/scorer_shared.py b/versa/scorer_shared.py index 4b768b0..c843f98 100644 --- a/versa/scorer_shared.py +++ b/versa/scorer_shared.py @@ -361,35 +361,26 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal "args": args_cache, } logging.info("Initiate Whisper WER calculation successfully") - elif config["name"] == "faster_whisper_wer": + elif config["name"] == "fwhisper_wer": if not use_gt_text: - logging.warning("Cannot use faster_whisper_wer because no gt text is provided") + logging.warning("Cannot use fwhisper_wer because no gt text is provided") continue - logging.info("Loading faster_whisper_wer metric with reference text") - from versa import faster_whisper_levenshtein_metric, faster_whisper_wer_setup + logging.info("Loading fwhisper_wer metric with reference text") + from versa import fwhisper_levenshtein_metric, fwhisper_wer_setup - # Load whisper model if it is already loaded - if ( - "speaking_rate" in score_modules.keys() - or "asr_matching" in score_modules.keys() - ): - args_cache = score_modules["speaking_rate"]["args"] - else: - args_cache = faster_whisper_wer_setup( + score_modules["fwhisper_wer"] = { + "module": fwhisper_levenshtein_metric, + "args": fwhisper_wer_setup( model_tag=config.get("model_tag", "default"), beam_size=config.get("beam_size", 1), batch_size=config.get("batch_size", 1), compute_type=config.get("compute_type", "float32"), text_cleaner=config.get("text_cleaner", "whisper_basic"), use_gpu=use_gpu, - ) - - score_modules["faster_whisper_wer"] = { - "module": faster_whisper_levenshtein_metric, - "args": args_cache, + ), } - logging.info("Initiate faster_whisper WER calculation successfully") + logging.info("Initiate fwhisper WER calculation successfully") elif config["name"] == "nemo_wer": if not use_gt_text: logging.warning("Cannot use nemo_wer because no gt text is provided") @@ -398,22 +389,13 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal logging.info("Loading nemo_wer metric with reference text") from versa import nemo_levenshtein_metric, nemo_wer_setup - # Load nemo asr model if it is already loaded - if ( - "speaking_rate" in score_modules.keys() - or "asr_matching" in score_modules.keys() - ): - args_cache = score_modules["speaking_rate"]["args"] - else: - args_cache = nemo_wer_setup( + score_modules["nemo_wer"] = { + "module": nemo_levenshtein_metric, + "args": nemo_wer_setup( model_tag=config.get("model_tag", "default"), text_cleaner=config.get("text_cleaner", "whisper_basic"), use_gpu=use_gpu, - ) - - score_modules["nemo_wer"] = { - "module": nemo_levenshtein_metric, - "args": args_cache, + ), } logging.info("Initiate NeMo WER calculation successfully") elif config["name"] == "hubert_wer": @@ -424,22 +406,13 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal logging.info("Loading hubert_wer metric with reference text") from versa import hubert_levenshtein_metric, hubert_wer_setup - # 
-        if (
-            "speaking_rate" in score_modules.keys()
-            or "asr_matching" in score_modules.keys()
-        ):
-            args_cache = score_modules["speaking_rate"]["args"]
-        else:
-            args_cache = hubert_wer_setup(
+        score_modules["hubert_wer"] = {
+            "module": hubert_levenshtein_metric,
+            "args": hubert_wer_setup(
                 model_tag=config.get("model_tag", "default"),
                 text_cleaner=config.get("text_cleaner", "whisper_basic"),
                 use_gpu=use_gpu,
-            )
-
-        score_modules["hubert_wer"] = {
-            "module": hubert_levenshtein_metric,
-            "args": args_cache,
+            ),
         }
         logging.info("Initiate hubert WER calculation successfully")
     elif config["name"] == "scoreq_ref":
@@ -1081,7 +1054,7 @@ def use_score_modules(score_modules, gen_wav, gt_wav, gen_sr, text=None):
             score = score_modules[key]["module"](
                 score_modules[key]["model"], gen_wav, gt_wav, gen_sr
             )
-        elif key in ["espnet_wer", "owsm_wer", "whisper_wer", "faster_whisper_wer", "nemo_wer", "hubert_wer"]:
+        elif key in ["espnet_wer", "owsm_wer", "whisper_wer", "fwhisper_wer", "nemo_wer", "hubert_wer"]:
             score = score_modules[key]["module"](
                 score_modules[key]["args"],
                 gen_wav,