From 4ac54060a9b3f6b19dce5934fc494dc9fafaec1f Mon Sep 17 00:00:00 2001 From: Nik Vaessen Date: Sun, 2 Feb 2025 21:02:16 -0500 Subject: [PATCH] add more docs on new features --- docs/usage.md | 70 ++++++++++++++++++++++++++++++++++++++++-- src/jiwer/alignment.py | 40 ++++++++++++++++++++++-- 2 files changed, 104 insertions(+), 6 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 4cdfc4f..ce605d2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,5 +1,7 @@ # Usage +## Word error rate + The most simple use-case is computing the word error rate between two strings: ```python @@ -41,6 +43,8 @@ hypothesis = ["hello duck", "i like python"] error = wer(reference, hypothesis) ``` +## Character error rate + We also provide the character error rate: ```python @@ -56,7 +60,7 @@ output = jiwer.process_characters(reference, hypothesis) error = output.cer ``` -# Alignment +## Alignment With `jiwer.process_words` and `jiwer.process_characters`, you get the alignment between the reference and hypothesis. @@ -88,16 +92,19 @@ print(jiwer.visualize_alignment(out)) ``` Gives the following output ```text -sentence 1 +=== SENTENCE 1 === + REF: **** short one here HYP: shoe order one **** I S D -sentence 2 +=== SENTENCE 2 === + REF: quite a bit of ** **** longer sentence **** HYP: quite * bit of an even longest sentence here D I I S I +=== SUMMARY === number of sentences: 2 substitutions=2 deletions=2 insertions=4 hits=5 @@ -108,3 +115,60 @@ wer=88.89% ``` Note that it also possible to visualize the character-level alignment, simply use the output of `jiwer.process_characters()` instead. 
+ +## Error frequencies + +You can list all the substitutions, insertions, and deletions, along with their frequencies: + +```python3 +import jiwer + +out = jiwer.process_words( + ["short one here", "quite a bit of longer sentence"], + ["shoe order one", "quite bit of an even longest sentence here"], +) + +print(jiwer.visualize_error_counts(out)) +``` + +Will return +```text +=== SUBSTITUTIONS === +short --> order = 1x +longer --> longest = 1x + +=== INSERTIONS === +shoe = 1x +an even = 1x +here = 1x + +=== DELETIONS === +here = 1x +a = 1x +``` + +## Transformations + +You can apply transformations to reference or hypothesis strings before the calculation of various metrics +with the transform API. For all available transformations, see [here](/jiwer/reference/transforms/). +For the default transformations, see [here](/jiwer/reference/transformations/). + +An example of the transformation API: + +```python3 +import jiwer + +tr = jiwer.Compose([ + jiwer.RemoveMultipleSpaces(), + jiwer.Strip(), + jiwer.SubstituteWords({"I'm": 'i am'}), + jiwer.ReduceToListOfListOfWords() +]) + +out = jiwer.process_words( + "I'm good", + "i am bad", + reference_transform=tr, + hypothesis_transform=tr +) +``` diff --git a/src/jiwer/alignment.py b/src/jiwer/alignment.py index 7787f35..24e7397 100644 --- a/src/jiwer/alignment.py +++ b/src/jiwer/alignment.py @@ -53,6 +53,7 @@ def visualize_alignment( Example: This code snippet + ```python import jiwer @@ -63,7 +64,9 @@ def visualize_alignment( print(jiwer.visualize_alignment(out)) ``` + will produce this visualization: + ```txt === SENTENCE 1 === @@ -71,7 +74,7 @@ def visualize_alignment( HYP: shoe order one * I S D - === sentence 2 === + === SENTENCE 2 === REF: quite a bit of # # longer sentence # HYP: quite * bit of an even longest sentence here @@ -97,6 +100,7 @@ def visualize_alignment( I S D === SENTENCE 2 === + REF: quite a bit of # # longer sentence # HYP: quite * bit of an even longest sentence here D I I S I @@ -106,6 +110,7 @@ 
def visualize_alignment( ```txt === SENTENCE 1 === + REF: This is a very long sentence that is *** much longer than the previous one HYP: This is a very loong sentence that is not much longer than the previous one S I @@ -268,14 +273,43 @@ def visualize_error_counts( Visualize which words (or characters), and how often, were substituted, inserted, or deleted. Args: - output: + output: The processed output of reference and hypothesis pair(s). show_substitutions: If true, visualize substitution errors. show_insertions: If true, visualize insertion errors. show_deletions: If true, visualize deletion errors. top_k: If set, only visualize the k most frequent errors. - Returns: A string which visualizes the words/characters and their frequencies. + Returns: + (str): A string which visualizes the words/characters and their frequencies. + + Example: + The code snippet + ```python3 + import jiwer + + out = jiwer.process_words( + ["short one here", "quite a bit of longer sentence"], + ["shoe order one", "quite bit of an even longest sentence here"], + ) + print(jiwer.visualize_error_counts(out)) + ``` + + will print the following: + ```txt + === SUBSTITUTIONS === + short --> order = 1x + longer --> longest = 1x + + === INSERTIONS === + shoe = 1x + an even = 1x + here = 1x + + === DELETIONS === + here = 1x + a = 1x + ``` """ s, i, d = collect_error_counts(output)