From 51156fa1e4a7039cbaa3004159aec4246e8593cf Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Fri, 20 Feb 2026 21:06:03 +0100 Subject: [PATCH 01/16] Add ACHAP evaluation metrics --- pyproject.toml | 4 +- src/mcif/evaluation.py | 92 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 94 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a443d9f..95afcc9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,9 @@ dependencies = [ "jiwer==3.0.5", "bert_score==0.3.13", "unbabel-comet==2.2.4", - "whisper_normalizer==0.0.10" + "whisper_normalizer==0.0.10", + "mutagen>=1.47", + "chunkseg[align,titles]==0.3.1" ] dynamic = ["version"] diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index 1740486..8cf3e48 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -44,6 +44,9 @@ CHAR_LEVEL_LANGS = {"zh"} +# ISO 639-1 → ISO 639-3 mapping for chunkseg forced alignment +_CHUNKSEG_LANG = {"en": "eng", "de": "deu", "it": "ita", "zh": "zho"} + @dataclass class ReferenceSample: @@ -281,6 +284,86 @@ def score_st( return comet_score(comet_data) +def _audio_duration(audio_path: str) -> float: + """Return audio duration in seconds using mutagen (metadata-only, no decoding).""" + from mutagen import File + return File(audio_path).info.length + + +def score_achap( + hypo_dict: Dict[str, str], + ref_dict: Dict[str, Dict[str, ReferenceSample]], + lang: str) -> Dict[str, float]: + """ + Computes chunkseg metrics for audio chaptering (ACHAP): collar-based F1, time-chunk F1, WER, and title evaluation. + + Hypothesis is a plain Markdown transcript (no timestamps); chunkseg derives + boundary timestamps and title time associations via forced alignment internally. + + Reference XML format: + : JSON [[title, start_seconds], ...] + : path to audio file + : reference transcript text (optional; enables WER) + """ + import json + from chunkseg import evaluate_batch + + chunkseg_lang = _CHUNKSEG_LANG.get(lang, "eng") + samples = [] + has_transcript = False + + for iid, ref_sample in ref_dict["ACHAP"].items(): + assert len(ref_sample.sample_ids) == 1, \ + f"ACHAP reference (IID: {iid}) mapped to multiple sample IDs: " \ + f"{ref_sample.sample_ids}" + hypo_text = hypo_dict[ref_sample.sample_ids[0]] + ref_chapters = json.loads(ref_sample.reference) # [[title, start_sec], ...] + ref_titles = [(t, float(s)) for t, s in ref_chapters] + ref_boundaries = [float(s) for _, s in ref_chapters] + audio_path = ref_sample.metadata["audio_path"] + duration = _audio_duration(audio_path) + transcript = ref_sample.metadata.get("transcript") + if transcript is not None: + has_transcript = True + + sample = { + "hypothesis": hypo_text, + "reference": ref_boundaries, + "duration": duration, + "audio": audio_path, + "reference_titles": ref_titles, + } + if transcript is not None: + sample["reference_transcript"] = transcript + samples.append(sample) + + if not samples: + return {} + + results = evaluate_batch( + samples, + format="markdown", + lang=chunkseg_lang, + titles=True, + wer=has_transcript, + collar=3.0, + tolerance=5.0, + ) + + def _mean(key): + return results.get(key, {}).get("mean", 0.0) + + out = { + "collar_f1": _mean("collar_f1"), + "tm_bs_f1": _mean("tm_bs_f1"), + "gc_bs_f1": _mean("gc_bs_f1"), + "tm_matched": _mean("tm_matched"), + } + if has_transcript: + out["wer"] = _mean("wer") + return out + + def main( hypo_path: Path, ref_path: Path, @@ -310,7 +393,6 @@ def main( assert "TRANS" in ref.keys() scores["TRANS-COMET"] = score_st(hypo, ref, lang) else: - assert len(ref.keys()) == 3 or len(ref.keys()) == 2 assert "SUM" in ref.keys() scores["SUM-BERTScore"] = score_ssum(hypo, ref, lang) if lang == "en": @@ -319,6 +401,14 @@ def main( else: assert "TRANS" in ref.keys() scores["TRANS-COMET"] = score_st(hypo, ref, lang) + if "ACHAP" in ref.keys(): + achap = score_achap(hypo, ref, lang) + scores["ACHAP-CollarF1"] = achap.get("collar_f1", 0.0) + scores["ACHAP-TM-BS"] = achap.get("tm_bs_f1", 0.0) + scores["ACHAP-GC-BS"] = achap.get("gc_bs_f1", 0.0) + scores["ACHAP-TM-MATCHED"] = achap.get("tm_matched", 0.0) + if "wer" in achap: + scores["ACHAP-WER"] = achap["wer"] return scores From 8c69149c9bf8ab0acd49f46d8d14f241524b55ae Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Mon, 23 Feb 2026 15:43:45 +0100 Subject: [PATCH 02/16] Improve clarity and documentation --- src/mcif/evaluation.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index 8cf3e48..a5e07b7 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -47,7 +47,6 @@ # ISO 639-1 → ISO 639-3 mapping for chunkseg forced alignment _CHUNKSEG_LANG = {"en": "eng", "de": "deu", "it": "ita", "zh": "zho"} - @dataclass class ReferenceSample: sample_ids: List[str] @@ -295,11 +294,19 @@ def score_achap( ref_dict: Dict[str, Dict[str, ReferenceSample]], lang: str) -> Dict[str, float]: """ - Computes chunkseg metrics for audio chaptering (ACHAP): collar-based F1, time-chunk F1, WER, and title evaluation. + Computes chunkseg metrics for audio chaptering (ACHAP): + - Collar-based F1 (±3s collar): Comparing predicted chapter timestamps with reference timestamps with tolerance + - BERTScore for titles, with two different strategies + - Global Concatenation; concatenated predicted vs reference titles + - Temporally Matched; only comparing titles of predicted sections temporally matching reference sections + - WER: word error rate, for the transcript generated alongside (optional) Hypothesis is a plain Markdown transcript (no timestamps); chunkseg derives boundary timestamps and title time associations via forced alignment internally. + Following the work of: + `"Beyond Transcripts: A Renewed Perspective on Audio Chaptering" `_ + Reference XML format: : JSON [[title, start_seconds], ...] : path to audio file @@ -404,8 +411,8 @@ def main( if "ACHAP" in ref.keys(): achap = score_achap(hypo, ref, lang) scores["ACHAP-CollarF1"] = achap.get("collar_f1", 0.0) - scores["ACHAP-TM-BS"] = achap.get("tm_bs_f1", 0.0) - scores["ACHAP-GC-BS"] = achap.get("gc_bs_f1", 0.0) + scores["ACHAP-TM-BERTScore"] = achap.get("tm_bs_f1", 0.0) + scores["ACHAP-GC-BERTScore"] = achap.get("gc_bs_f1", 0.0) scores["ACHAP-TM-MATCHED"] = achap.get("tm_matched", 0.0) if "wer" in achap: scores["ACHAP-WER"] = achap["wer"] From 0424bd6d10d918ec53aa6da9942c7785448ff423 Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Tue, 24 Feb 2026 12:03:33 +0100 Subject: [PATCH 03/16] More concise, fix linting issues --- src/mcif/evaluation.py | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index a5e07b7..e0f2bee 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -26,6 +26,7 @@ import bert_score import jiwer +from chunkseg import evaluate_batch from comet import download_model, load_from_checkpoint from whisper_normalizer import english, basic @@ -295,11 +296,11 @@ def score_achap( lang: str) -> Dict[str, float]: """ Computes chunkseg metrics for audio chaptering (ACHAP): - - Collar-based F1 (±3s collar): Comparing predicted chapter timestamps with reference timestamps with tolerance - - BERTScore for titles, with two different strategies - - Global Concatenation; concatenated predicted vs reference titles - - Temporally Matched; only comparing titles of predicted sections temporally matching reference sections - - WER: word error rate, for the transcript generated alongside (optional) + - Collar-based F1 (±3s collar): predicted vs reference timestamps with tolerance + - BERTScore for titles, with two different strategies: + - Global Concatenation: concatenated predicted vs reference titles + - Temporally Matched: titles of predicted sections matching reference sections + - WER: word error rate for the transcript generated alongside (optional) Hypothesis is a plain Markdown transcript (no timestamps); chunkseg derives boundary timestamps and title time associations via forced alignment internally. @@ -312,9 +313,6 @@ def score_achap( : path to audio file : reference transcript text (optional; enables WER) """ - import json - from chunkseg import evaluate_batch - chunkseg_lang = _CHUNKSEG_LANG.get(lang, "eng") samples = [] has_transcript = False @@ -357,17 +355,14 @@ def score_achap( tolerance=5.0, ) - def _mean(key): - return results.get(key, {}).get("mean", 0.0) - out = { - "collar_f1": _mean("collar_f1"), - "tm_bs_f1": _mean("tm_bs_f1"), - "gc_bs_f1": _mean("gc_bs_f1"), - "tm_matched": _mean("tm_matched"), + "ACHAP-CollarF1": results["collar_f1"]["mean"], + "ACHAP-TM-BERTScore": results["tm_bs_f1"]["mean"], + "ACHAP-GC-BERTScore": results["gc_bs_f1"]["mean"], + "ACHAP-TM-MATCHED": results["tm_matched"]["mean"], } if has_transcript: - out["wer"] = _mean("wer") + out["ACHAP-WER"] = results["wer"]["mean"] return out @@ -409,13 +404,7 @@ def main( assert "TRANS" in ref.keys() scores["TRANS-COMET"] = score_st(hypo, ref, lang) if "ACHAP" in ref.keys(): - achap = score_achap(hypo, ref, lang) - scores["ACHAP-CollarF1"] = achap.get("collar_f1", 0.0) - scores["ACHAP-TM-BERTScore"] = achap.get("tm_bs_f1", 0.0) - scores["ACHAP-GC-BERTScore"] = achap.get("gc_bs_f1", 0.0) - scores["ACHAP-TM-MATCHED"] = achap.get("tm_matched", 0.0) - if "wer" in achap: - scores["ACHAP-WER"] = achap["wer"] + scores.update(score_achap(hypo, ref, lang)) return scores From 36845cdc332d0f0ee666ab6502eea31f6a2e2671 Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Tue, 24 Feb 2026 12:32:03 +0100 Subject: [PATCH 04/16] Fix remaining linting issues --- src/mcif/evaluation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index e0f2bee..ab7b736 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -48,6 +48,7 @@ # ISO 639-1 → ISO 639-3 mapping for chunkseg forced alignment _CHUNKSEG_LANG = {"en": "eng", "de": "deu", "it": "ita", "zh": "zho"} + @dataclass class ReferenceSample: sample_ids: List[str] @@ -306,7 +307,8 @@ def score_achap( boundary timestamps and title time associations via forced alignment internally. Following the work of: - `"Beyond Transcripts: A Renewed Perspective on Audio Chaptering" `_ + `"Beyond Transcripts: A Renewed Perspective on Audio Chaptering" + `_ Reference XML format: : JSON [[title, start_seconds], ...] From 3cf8154329e4ba6a92e4e40f485fa2121b6af638 Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Wed, 11 Mar 2026 22:15:06 +0100 Subject: [PATCH 05/16] chunkseg_lang: no default, direct indexing --- src/mcif/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index ab7b736..29acb84 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -315,7 +315,7 @@ def score_achap( : path to audio file : reference transcript text (optional; enables WER) """ - chunkseg_lang = _CHUNKSEG_LANG.get(lang, "eng") + chunkseg_lang = _CHUNKSEG_LANG[lang] samples = [] has_transcript = False From 5e66bcedf1535d050594434ea446e0f7547cc942 Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Fri, 13 Mar 2026 17:49:17 +0100 Subject: [PATCH 06/16] Inject audio path from source into reference metadata --- src/mcif/evaluation.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index 29acb84..487c9e2 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -1,3 +1,4 @@ + # Copyright 2025 FBK, KIT # Licensed under the Apache License, Version 2.0 (the "License"); @@ -141,8 +142,17 @@ def read_reference( ref_path: Path, track: str, language: str, - modality: Optional[str] = None) -> Dict[str, Dict[str, ReferenceSample]]: + modality: Optional[str] = None, + hypo_path: Optional[Path] = None) -> Dict[str, Dict[str, ReferenceSample]]: xml = ET.parse(ref_path) + audio_paths = {} + if hypo_path is not None: + hypo_xml = ET.parse(hypo_path) + for task in hypo_xml.getroot().iter("task"): + if task.attrib['track'] == track and task.attrib['text_lang'] == language: + for s in task.iter("sample"): + audio_paths[s.attrib['id']] = s.find('audio_path').text + break avail_tasks = [] for task in xml.getroot().iter("task"): if task.attrib['track'] == track and task.attrib['text_lang'] == language: @@ -160,6 +170,9 @@ def read_reference( for field in ['qa_type', 'qa_origin']: if field in sample.attrib: sample_metadata[field] = sample.attrib[field] + for sid in sample_ids: + if sid in audio_paths: + sample_metadata['audio_path'] = audio_paths[sid] samples_by_subtask[sample.attrib['task']][sample.attrib['iid']] = \ ReferenceSample(sample_ids, sample_reference, sample_metadata) return samples_by_subtask @@ -312,7 +325,6 @@ def score_achap( Reference XML format: : JSON [[title, start_seconds], ...] - : path to audio file : reference transcript text (optional; enables WER) """ chunkseg_lang = _CHUNKSEG_LANG[lang] @@ -379,7 +391,8 @@ def main( Main function computing all the scores and returning a Dictionary with the scores """ hypo = read_hypo(hypo_path, track, lang) - ref = read_reference(ref_path, track, lang, modality=filter_modality) + ref = read_reference(ref_path, track, lang, modality=filter_modality, + hypo_path=hypo_path) scores = {} assert "QA" in ref.keys() scores["QA-BERTScore"], qa_types_scores = score_sqa(hypo, ref, lang, breakdown_qa_types) From 42cfbdfbceb1f5c59ec706cc6c70883d4d45ff18 Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Fri, 13 Mar 2026 19:33:29 +0100 Subject: [PATCH 07/16] Assume audio_path in reference file --- src/mcif/evaluation.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index 487c9e2..b819715 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -1,4 +1,3 @@ - # Copyright 2025 FBK, KIT # Licensed under the Apache License, Version 2.0 (the "License"); @@ -142,17 +141,8 @@ def read_reference( ref_path: Path, track: str, language: str, - modality: Optional[str] = None, - hypo_path: Optional[Path] = None) -> Dict[str, Dict[str, ReferenceSample]]: + modality: Optional[str] = None) -> Dict[str, Dict[str, ReferenceSample]]: xml = ET.parse(ref_path) - audio_paths = {} - if hypo_path is not None: - hypo_xml = ET.parse(hypo_path) - for task in hypo_xml.getroot().iter("task"): - if task.attrib['track'] == track and task.attrib['text_lang'] == language: - for s in task.iter("sample"): - audio_paths[s.attrib['id']] = s.find('audio_path').text - break avail_tasks = [] for task in xml.getroot().iter("task"): if task.attrib['track'] == track and task.attrib['text_lang'] == language: @@ -163,16 +153,13 @@ def read_reference( samples_by_subtask[sample.attrib['task']] = {} sample_ids = sample.attrib['id'].split(",") sample_reference = next(sample.iter('reference')).text - sample_metadata = {} + sample_metadata = {'audio_path': next(sample.iter('audio_path')).text} for metadata in sample.iter('metadata'): for metadata_field in metadata.iter(): sample_metadata[metadata_field.tag] = metadata_field.text for field in ['qa_type', 'qa_origin']: if field in sample.attrib: sample_metadata[field] = sample.attrib[field] - for sid in sample_ids: - if sid in audio_paths: - sample_metadata['audio_path'] = audio_paths[sid] samples_by_subtask[sample.attrib['task']][sample.attrib['iid']] = \ ReferenceSample(sample_ids, sample_reference, sample_metadata) return samples_by_subtask @@ -391,8 +378,7 @@ def main( Main function computing all the scores and returning a Dictionary with the scores """ hypo = read_hypo(hypo_path, track, lang) - ref = read_reference(ref_path, track, lang, modality=filter_modality, - hypo_path=hypo_path) + ref = read_reference(ref_path, track, lang, modality=filter_modality) scores = {} assert "QA" in ref.keys() scores["QA-BERTScore"], qa_types_scores = score_sqa(hypo, ref, lang, breakdown_qa_types) From e321f3adcdc9721503601a94308eaf5aaf01d50f Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Mon, 16 Mar 2026 18:04:20 +0100 Subject: [PATCH 08/16] Fix LookupError by updating chunkseg dependency version to 0.3.2 to --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 95afcc9..0487032 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "unbabel-comet==2.2.4", "whisper_normalizer==0.0.10", "mutagen>=1.47", - "chunkseg[align,titles]==0.3.1" + "chunkseg[align,titles]==0.3.2" ] dynamic = ["version"] From 244e06b6f457ecf603e4f0ea64a1052ca45ced94 Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Wed, 18 Mar 2026 18:49:50 +0100 Subject: [PATCH 09/16] Use relative path instead of absolute path, chunkseg version bump --- pyproject.toml | 2 +- src/mcif/evaluation.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 95afcc9..b176f55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "unbabel-comet==2.2.4", "whisper_normalizer==0.0.10", "mutagen>=1.47", - "chunkseg[align,titles]==0.3.1" + "chunkseg[align,titles]==0.3.3" ] dynamic = ["version"] diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index b819715..472827e 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -292,6 +292,7 @@ def _audio_duration(audio_path: str) -> float: def score_achap( + base_ref_path: Path, hypo_dict: Dict[str, str], ref_dict: Dict[str, Dict[str, ReferenceSample]], lang: str) -> Dict[str, float]: @@ -326,8 +327,8 @@ def score_achap( ref_chapters = json.loads(ref_sample.reference) # [[title, start_sec], ...] ref_titles = [(t, float(s)) for t, s in ref_chapters] ref_boundaries = [float(s) for _, s in ref_chapters] - audio_path = ref_sample.metadata["audio_path"] - duration = _audio_duration(audio_path) + audio_path = base_ref_path / "LONG_AUDIOS" / ref_sample.metadata["audio_path"] + duration = _audio_duration(audio_path.absolute().as_posix()) transcript = ref_sample.metadata.get("transcript") if transcript is not None: has_transcript = True @@ -336,7 +337,7 @@ def score_achap( "hypothesis": hypo_text, "reference": ref_boundaries, "duration": duration, - "audio": audio_path, + "audio": audio_path.absolute().as_posix(), "reference_titles": ref_titles, } if transcript is not None: @@ -405,7 +406,7 @@ def main( assert "TRANS" in ref.keys() scores["TRANS-COMET"] = score_st(hypo, ref, lang) if "ACHAP" in ref.keys(): - scores.update(score_achap(hypo, ref, lang)) + scores.update(score_achap(ref_path.parent, hypo, ref, lang)) return scores From 5edab8defd4b325e09fd9a0a9f615a423ab646c6 Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Thu, 19 Mar 2026 18:55:45 +0100 Subject: [PATCH 10/16] Handle crosslingual evaluation by aligning hypothesis translation with reference transcript and using the latter for forced alignment --- pyproject.toml | 2 +- src/mcif/evaluation.py | 74 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b176f55..b40193e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "unbabel-comet==2.2.4", "whisper_normalizer==0.0.10", "mutagen>=1.47", - "chunkseg[align,titles]==0.3.3" + "chunkseg[align,titles]==0.3.4" ] dynamic = ["version"] diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index 472827e..ea787d4 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -27,6 +27,7 @@ import bert_score import jiwer from chunkseg import evaluate_batch +from chunkseg.parsers import parse_transcript from comet import download_model, load_from_checkpoint from whisper_normalizer import english, basic @@ -291,6 +292,56 @@ def _audio_duration(audio_path: str) -> float: return File(audio_path).info.length +def _replace_translation_with_transcript( + hypo_text: str, + gold_translation: str, + ref_transcript: str, + target_lang: str) -> str: + """Replace translated hypothesis body with English transcript via mwerSegmenter.""" + parsed = parse_transcript(hypo_text, "markdown") + titles = parsed.titles or [] + sections = parsed.sections or [] + + if not titles or not sections: + return hypo_text + + section_texts = [" ".join(sents) for sents in sections] + full_hyp = " ".join(section_texts) + + gold_lines = [l for l in gold_translation.strip().split("\n") if l.strip()] + ref_lines = [l for l in ref_transcript.strip().split("\n") if l.strip()] + assert len(gold_lines) == len(ref_lines), \ + f"Gold translation ({len(gold_lines)}) and transcript ({len(ref_lines)}) " \ + f"line counts differ" + + segmenter = MwerSegmenter(character_level=(target_lang in CHAR_LEVEL_LANGS)) + reseg = segmenter(full_hyp, gold_lines) + + section_ends, pos = [], 0 + for t in section_texts: + pos += len(t) + section_ends.append(pos) + pos += 1 + + section_ref: List[List[str]] = [[] for _ in titles] + hyp_pos, sec_idx = 0, 0 + for i, seg in enumerate(reseg): + seg = seg.strip() + if not seg: + continue + found = full_hyp.find(seg, hyp_pos) + mid = found + len(seg) // 2 if found >= 0 else hyp_pos + if found >= 0: + hyp_pos = found + len(seg) + while sec_idx < len(section_ends) - 1 and mid >= section_ends[sec_idx]: + sec_idx += 1 + section_ref[sec_idx].append(ref_lines[i]) + + return "\n".join( + f"# {t}\n{' '.join(r)}\n" for t, r in zip(titles, section_ref) + ).strip() + + def score_achap( base_ref_path: Path, hypo_dict: Dict[str, str], @@ -307,15 +358,21 @@ def score_achap( Hypothesis is a plain Markdown transcript (no timestamps); chunkseg derives boundary timestamps and title time associations via forced alignment internally. + For crosslingual evaluation, the translated hypothesis is + aligned to the gold translation via mwerSegmenter and replaced with the + reference transcript before passing to chunkseg. + Following the work of: `"Beyond Transcripts: A Renewed Perspective on Audio Chaptering" `_ Reference XML format: : JSON [[title, start_seconds], ...] - : reference transcript text (optional; enables WER) + : English reference transcript (optional; enables WER) + : gold translation, line-aligned with transcript + (crosslingual only) """ - chunkseg_lang = _CHUNKSEG_LANG[lang] + crosslingual = (lang != "en") samples = [] has_transcript = False @@ -330,7 +387,13 @@ def score_achap( audio_path = base_ref_path / "LONG_AUDIOS" / ref_sample.metadata["audio_path"] duration = _audio_duration(audio_path.absolute().as_posix()) transcript = ref_sample.metadata.get("transcript") - if transcript is not None: + translation = ref_sample.metadata.get("translation") + + if crosslingual and translation is not None and transcript is not None: + hypo_text = _replace_translation_with_transcript( + hypo_text, translation, transcript, lang) + + if transcript is not None and not crosslingual: has_transcript = True sample = { @@ -340,7 +403,7 @@ def score_achap( "audio": audio_path.absolute().as_posix(), "reference_titles": ref_titles, } - if transcript is not None: + if has_transcript: sample["reference_transcript"] = transcript samples.append(sample) @@ -350,7 +413,8 @@ def score_achap( results = evaluate_batch( samples, format="markdown", - lang=chunkseg_lang, + src_lang="eng", + tgt_lang=lang, titles=True, wer=has_transcript, collar=3.0, From f4f0b284b7da457b54a28e308b39beb178975340 Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Thu, 19 Mar 2026 19:08:07 +0100 Subject: [PATCH 11/16] Fix linting issue --- src/mcif/evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index ea787d4..8c095fe 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -308,8 +308,8 @@ def _replace_translation_with_transcript( section_texts = [" ".join(sents) for sents in sections] full_hyp = " ".join(section_texts) - gold_lines = [l for l in gold_translation.strip().split("\n") if l.strip()] - ref_lines = [l for l in ref_transcript.strip().split("\n") if l.strip()] + gold_lines = [s for s in gold_translation.strip().split("\n") if s.strip()] + ref_lines = [s for s in ref_transcript.strip().split("\n") if s.strip()] assert len(gold_lines) == len(ref_lines), \ f"Gold translation ({len(gold_lines)}) and transcript ({len(ref_lines)}) " \ f"line counts differ" From bae4a9216e8f4da767bd75743e3e005ea0f3e7d7 Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Fri, 20 Mar 2026 13:57:56 +0100 Subject: [PATCH 12/16] Update src/mcif/evaluation.py Co-authored-by: sarapapi <57095209+sarapapi@users.noreply.github.com> --- src/mcif/evaluation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index 8c095fe..b860b39 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -369,8 +369,7 @@ def score_achap( Reference XML format: : JSON [[title, start_seconds], ...] : English reference transcript (optional; enables WER) - : gold translation, line-aligned with transcript - (crosslingual only) + : reference translation, line-aligned with transcript """ crosslingual = (lang != "en") samples = [] From 985ce3dc98c3812daa0448f4f7f8b3c752ed10bd Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Fri, 20 Mar 2026 15:45:41 +0100 Subject: [PATCH 13/16] Always require transcript --- src/mcif/evaluation.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index b860b39..2b71d01 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -353,7 +353,7 @@ def score_achap( - BERTScore for titles, with two different strategies: - Global Concatenation: concatenated predicted vs reference titles - Temporally Matched: titles of predicted sections matching reference sections - - WER: word error rate for the transcript generated alongside (optional) + - WER: word error rate for the transcript generated alongside Hypothesis is a plain Markdown transcript (no timestamps); chunkseg derives boundary timestamps and title time associations via forced alignment internally. @@ -368,12 +368,11 @@ def score_achap( Reference XML format: : JSON [[title, start_seconds], ...] - : English reference transcript (optional; enables WER) + : English reference transcript : reference translation, line-aligned with transcript """ crosslingual = (lang != "en") samples = [] - has_transcript = False for iid, ref_sample in ref_dict["ACHAP"].items(): assert len(ref_sample.sample_ids) == 1, \ @@ -385,25 +384,21 @@ def score_achap( ref_boundaries = [float(s) for _, s in ref_chapters] audio_path = base_ref_path / "LONG_AUDIOS" / ref_sample.metadata["audio_path"] duration = _audio_duration(audio_path.absolute().as_posix()) - transcript = ref_sample.metadata.get("transcript") - translation = ref_sample.metadata.get("translation") + transcript = ref_sample.metadata["transcript"] - if crosslingual and translation is not None and transcript is not None: + if crosslingual: + translation = ref_sample.metadata["translation"] hypo_text = _replace_translation_with_transcript( hypo_text, translation, transcript, lang) - if transcript is not None and not crosslingual: - has_transcript = True - sample = { "hypothesis": hypo_text, "reference": ref_boundaries, "duration": duration, "audio": audio_path.absolute().as_posix(), "reference_titles": ref_titles, + "reference_transcript": transcript, } - if has_transcript: - sample["reference_transcript"] = transcript samples.append(sample) if not samples: @@ -415,7 +410,7 @@ def score_achap( src_lang="eng", tgt_lang=lang, titles=True, - wer=has_transcript, + wer=not crosslingual, collar=3.0, tolerance=5.0, ) @@ -426,7 +421,7 @@ def score_achap( "ACHAP-GC-BERTScore": results["gc_bs_f1"]["mean"], "ACHAP-TM-MATCHED": results["tm_matched"]["mean"], } - if has_transcript: + if not crosslingual: out["ACHAP-WER"] = results["wer"]["mean"] return out From 93d8fc5bf37890949dcea5c3fc6c2d2a4ecff692 Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Fri, 20 Mar 2026 15:59:27 +0100 Subject: [PATCH 14/16] Separate alignment into its own function --- src/mcif/evaluation.py | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index 2b71d01..1397ee4 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -292,28 +292,21 @@ def _audio_duration(audio_path: str) -> float: return File(audio_path).info.length -def _replace_translation_with_transcript( +def _align_sections( hypo_text: str, - gold_translation: str, - ref_transcript: str, - target_lang: str) -> str: - """Replace translated hypothesis body with English transcript via mwerSegmenter.""" + gold_lines: List[str], + target_lang: str) -> Tuple[List[str], List[List[int]]]: + """Align hypothesis sections to gold translation lines via mwerSegmenter.""" parsed = parse_transcript(hypo_text, "markdown") titles = parsed.titles or [] sections = parsed.sections or [] if not titles or not sections: - return hypo_text + return titles, [[] for _ in titles] section_texts = [" ".join(sents) for sents in sections] full_hyp = " ".join(section_texts) - gold_lines = [s for s in gold_translation.strip().split("\n") if s.strip()] - ref_lines = [s for s in ref_transcript.strip().split("\n") if s.strip()] - assert len(gold_lines) == len(ref_lines), \ - f"Gold translation ({len(gold_lines)}) and transcript ({len(ref_lines)}) " \ - f"line counts differ" - segmenter = MwerSegmenter(character_level=(target_lang in CHAR_LEVEL_LANGS)) reseg = segmenter(full_hyp, gold_lines) @@ -323,7 +316,7 @@ def _replace_translation_with_transcript( section_ends.append(pos) pos += 1 - section_ref: List[List[str]] = [[] for _ in titles] + section_to_line_map: List[List[int]] = [[] for _ in titles] hyp_pos, sec_idx = 0, 0 for i, seg in enumerate(reseg): seg = seg.strip() @@ -335,7 +328,28 @@ def _replace_translation_with_transcript( hyp_pos = found + len(seg) while sec_idx < len(section_ends) - 1 and mid >= section_ends[sec_idx]: sec_idx += 1 - section_ref[sec_idx].append(ref_lines[i]) + section_to_line_map[sec_idx].append(i) + + return titles, section_to_line_map + + +def _replace_translation_with_transcript( + hypo_text: str, + gold_translation: str, + ref_transcript: str, + target_lang: str) -> str: + """Replace translated hypothesis body with reference transcript via mwerSegmenter.""" + gold_lines = [s for s in gold_translation.strip().split("\n") if s.strip()] + ref_lines = [s for s in ref_transcript.strip().split("\n") if s.strip()] + assert len(gold_lines) == len(ref_lines), \ + f"Gold translation ({len(gold_lines)}) and transcript ({len(ref_lines)}) " \ + f"line counts differ" + + titles, section_to_line_map = _align_sections(hypo_text, gold_lines, target_lang) + if not titles: + return hypo_text + + section_ref = [[ref_lines[i] for i in indices] for indices in section_to_line_map] return "\n".join( f"# {t}\n{' '.join(r)}\n" for t, r in zip(titles, section_ref) From 89d0bde005a98b72d2d2dcf065dee6450524b170 Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Tue, 24 Mar 2026 13:22:33 +0100 Subject: [PATCH 15/16] Calculate ACHAP-COMET --- src/mcif/evaluation.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index 1397ee4..84a70ea 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -367,7 +367,7 @@ def score_achap( - BERTScore for titles, with two different strategies: - Global Concatenation: concatenated predicted vs reference titles - Temporally Matched: titles of predicted sections matching reference sections - - WER: word error rate for the transcript generated alongside + - WER/COMET: quality measure for the transcript/translation generated alongside Hypothesis is a plain Markdown transcript (no timestamps); chunkseg derives boundary timestamps and title time associations via forced alignment internally. @@ -387,6 +387,7 @@ def score_achap( """ crosslingual = (lang != "en") samples = [] + comet_data = [] for iid, ref_sample in ref_dict["ACHAP"].items(): assert len(ref_sample.sample_ids) == 1, \ @@ -405,6 +406,16 @@ def score_achap( hypo_text = _replace_translation_with_transcript( hypo_text, translation, transcript, lang) + # Prepare COMET data + gold_lines = [s for s in translation.strip().split("\n") if s.strip()] + src_lines = [s for s in transcript.strip().split("\n") if s.strip()] + segmenter = MwerSegmenter(character_level=(lang in CHAR_LEVEL_LANGS)) + parsed = parse_transcript(hypo_dict[ref_sample.sample_ids[0]], "markdown") + flat = " ".join(" ".join(s) for s in (parsed.sections or [])) + reseg = segmenter(flat, gold_lines) + for mt, ref, src in zip(reseg, gold_lines, src_lines): + comet_data.append({"src": src.strip(), "mt": mt.strip(), "ref": ref.strip()}) + sample = { "hypothesis": hypo_text, "reference": ref_boundaries, @@ -435,7 +446,9 @@ def score_achap( "ACHAP-GC-BERTScore": results["gc_bs_f1"]["mean"], "ACHAP-TM-MATCHED": results["tm_matched"]["mean"], } - if not crosslingual: + if crosslingual: + out["ACHAP-COMET"] = comet_score(comet_data) + else: out["ACHAP-WER"] = results["wer"]["mean"] return out From ec53e8569d959998ee39934bb3fa40d70421a0f7 Mon Sep 17 00:00:00 2001 From: Fabian Retkowski Date: Tue, 24 Mar 2026 16:57:08 +0100 Subject: [PATCH 16/16] Remove unused language mapping --- src/mcif/evaluation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py index 84a70ea..764e4a9 100644 --- a/src/mcif/evaluation.py +++ b/src/mcif/evaluation.py @@ -46,9 +46,6 @@ CHAR_LEVEL_LANGS = {"zh"} -# ISO 639-1 → ISO 639-3 mapping for chunkseg forced alignment -_CHUNKSEG_LANG = {"en": "eng", "de": "deu", "it": "ita", "zh": "zho"} - @dataclass class ReferenceSample: