From 51156fa1e4a7039cbaa3004159aec4246e8593cf Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <fabian.retkowski@kit.edu>
Date: Fri, 20 Feb 2026 21:06:03 +0100
Subject: [PATCH 01/16] Add ACHAP evaluation metrics

---
 pyproject.toml         |  4 +-
 src/mcif/evaluation.py | 92 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 94 insertions(+), 2 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index a443d9f..95afcc9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,9 @@ dependencies = [
     "jiwer==3.0.5",
     "bert_score==0.3.13",
     "unbabel-comet==2.2.4",
-    "whisper_normalizer==0.0.10"
+    "whisper_normalizer==0.0.10",
+    "mutagen>=1.47",
+    "chunkseg[align,titles]==0.3.1"
 ]
 
 dynamic = ["version"]
diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index 1740486..8cf3e48 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -44,6 +44,9 @@
 
 CHAR_LEVEL_LANGS = {"zh"}
 
+# ISO 639-1 → ISO 639-3 mapping for chunkseg forced alignment
+_CHUNKSEG_LANG = {"en": "eng", "de": "deu", "it": "ita", "zh": "zho"}
+
 
 @dataclass
 class ReferenceSample:
@@ -281,6 +284,86 @@ def score_st(
     return comet_score(comet_data)
 
 
+def _audio_duration(audio_path: str) -> float:
+    """Return audio duration in seconds using mutagen (metadata-only, no decoding)."""
+    from mutagen import File
+    return File(audio_path).info.length
+
+
+def score_achap(
+        hypo_dict: Dict[str, str],
+        ref_dict: Dict[str, Dict[str, ReferenceSample]],
+        lang: str) -> Dict[str, float]:
+    """
+    Computes chunkseg metrics for audio chaptering (ACHAP): collar-based F1, time-chunk F1, WER, and title evaluation.
+
+    Hypothesis is a plain Markdown transcript (no timestamps); chunkseg derives
+    boundary timestamps and title time associations via forced alignment internally.
+
+    Reference XML format:
+      <reference>: JSON [[title, start_seconds], ...]
+      <metadata><audio_path>: path to audio file
+      <metadata><transcript>: reference transcript text (optional; enables WER)
+    """
+    import json
+    from chunkseg import evaluate_batch
+
+    chunkseg_lang = _CHUNKSEG_LANG.get(lang, "eng")
+    samples = []
+    has_transcript = False
+
+    for iid, ref_sample in ref_dict["ACHAP"].items():
+        assert len(ref_sample.sample_ids) == 1, \
+            f"ACHAP reference (IID: {iid}) mapped to multiple sample IDs: " \
+            f"{ref_sample.sample_ids}"
+        hypo_text = hypo_dict[ref_sample.sample_ids[0]]
+        ref_chapters = json.loads(ref_sample.reference)  # [[title, start_sec], ...]
+        ref_titles = [(t, float(s)) for t, s in ref_chapters]
+        ref_boundaries = [float(s) for _, s in ref_chapters]
+        audio_path = ref_sample.metadata["audio_path"]
+        duration = _audio_duration(audio_path)
+        transcript = ref_sample.metadata.get("transcript")
+        if transcript is not None:
+            has_transcript = True
+
+        sample = {
+            "hypothesis": hypo_text,
+            "reference": ref_boundaries,
+            "duration": duration,
+            "audio": audio_path,
+            "reference_titles": ref_titles,
+        }
+        if transcript is not None:
+            sample["reference_transcript"] = transcript
+        samples.append(sample)
+
+    if not samples:
+        return {}
+
+    results = evaluate_batch(
+        samples,
+        format="markdown",
+        lang=chunkseg_lang,
+        titles=True,
+        wer=has_transcript,
+        collar=3.0,
+        tolerance=5.0,
+    )
+
+    def _mean(key):
+        return results.get(key, {}).get("mean", 0.0)
+
+    out = {
+        "collar_f1":  _mean("collar_f1"),
+        "tm_bs_f1":   _mean("tm_bs_f1"),
+        "gc_bs_f1":   _mean("gc_bs_f1"),
+        "tm_matched": _mean("tm_matched"),
+    }
+    if has_transcript:
+        out["wer"] = _mean("wer")
+    return out
+
+
 def main(
         hypo_path: Path,
         ref_path: Path,
@@ -310,7 +393,6 @@ def main(
             assert "TRANS" in ref.keys()
             scores["TRANS-COMET"] = score_st(hypo, ref, lang)
     else:
-        assert len(ref.keys()) == 3 or len(ref.keys()) == 2
         assert "SUM" in ref.keys()
         scores["SUM-BERTScore"] = score_ssum(hypo, ref, lang)
         if lang == "en":
@@ -319,6 +401,14 @@ def main(
         else:
             assert "TRANS" in ref.keys()
             scores["TRANS-COMET"] = score_st(hypo, ref, lang)
+        if "ACHAP" in ref.keys():
+            achap = score_achap(hypo, ref, lang)
+            scores["ACHAP-CollarF1"]   = achap.get("collar_f1", 0.0)
+            scores["ACHAP-TM-BS"]      = achap.get("tm_bs_f1", 0.0)
+            scores["ACHAP-GC-BS"]      = achap.get("gc_bs_f1", 0.0)
+            scores["ACHAP-TM-MATCHED"] = achap.get("tm_matched", 0.0)
+            if "wer" in achap:
+                scores["ACHAP-WER"] = achap["wer"]
     return scores
 
 

From 8c69149c9bf8ab0acd49f46d8d14f241524b55ae Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <fabian.retkowski@kit.edu>
Date: Mon, 23 Feb 2026 15:43:45 +0100
Subject: [PATCH 02/16] Improve clarity and documentation

---
 src/mcif/evaluation.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index 8cf3e48..a5e07b7 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -47,7 +47,6 @@
 # ISO 639-1 → ISO 639-3 mapping for chunkseg forced alignment
 _CHUNKSEG_LANG = {"en": "eng", "de": "deu", "it": "ita", "zh": "zho"}
 
-
 @dataclass
 class ReferenceSample:
     sample_ids: List[str]
@@ -295,11 +294,19 @@ def score_achap(
         ref_dict: Dict[str, Dict[str, ReferenceSample]],
         lang: str) -> Dict[str, float]:
     """
-    Computes chunkseg metrics for audio chaptering (ACHAP): collar-based F1, time-chunk F1, WER, and title evaluation.
+    Computes chunkseg metrics for audio chaptering (ACHAP):
+    - Collar-based F1 (±3s collar): Comparing predicted chapter timestamps with reference timestamps with tolerance
+    - BERTScore for titles, with two different strategies
+        - Global Concatenation; concatenated predicted vs reference titles
+        - Temporally Matched; only comparing titles of predicted sections temporally matching reference sections
+    - WER: word error rate, for the transcript generated alongside (optional)
 
     Hypothesis is a plain Markdown transcript (no timestamps); chunkseg derives
     boundary timestamps and title time associations via forced alignment internally.
 
+    Following the work of:
+    `"Beyond Transcripts: A Renewed Perspective on Audio Chaptering" <https://www.arxiv.org/abs/2602.08979>`_
+
     Reference XML format:
       <reference>: JSON [[title, start_seconds], ...]
       <metadata><audio_path>: path to audio file
@@ -404,8 +411,8 @@ def main(
         if "ACHAP" in ref.keys():
             achap = score_achap(hypo, ref, lang)
             scores["ACHAP-CollarF1"]   = achap.get("collar_f1", 0.0)
-            scores["ACHAP-TM-BS"]      = achap.get("tm_bs_f1", 0.0)
-            scores["ACHAP-GC-BS"]      = achap.get("gc_bs_f1", 0.0)
+            scores["ACHAP-TM-BERTScore"]      = achap.get("tm_bs_f1", 0.0)
+            scores["ACHAP-GC-BERTScore"]      = achap.get("gc_bs_f1", 0.0)
             scores["ACHAP-TM-MATCHED"] = achap.get("tm_matched", 0.0)
             if "wer" in achap:
                 scores["ACHAP-WER"] = achap["wer"]

From 0424bd6d10d918ec53aa6da9942c7785448ff423 Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <fabian.retkowski@kit.edu>
Date: Tue, 24 Feb 2026 12:03:33 +0100
Subject: [PATCH 03/16] More concise, fix linting issues

---
 src/mcif/evaluation.py | 35 ++++++++++++-----------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index a5e07b7..e0f2bee 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -26,6 +26,7 @@
 
 import bert_score
 import jiwer
+from chunkseg import evaluate_batch
 from comet import download_model, load_from_checkpoint
 from whisper_normalizer import english, basic
 
@@ -295,11 +296,11 @@ def score_achap(
         lang: str) -> Dict[str, float]:
     """
     Computes chunkseg metrics for audio chaptering (ACHAP):
-    - Collar-based F1 (±3s collar): Comparing predicted chapter timestamps with reference timestamps with tolerance
-    - BERTScore for titles, with two different strategies
-        - Global Concatenation; concatenated predicted vs reference titles
-        - Temporally Matched; only comparing titles of predicted sections temporally matching reference sections
-    - WER: word error rate, for the transcript generated alongside (optional)
+    - Collar-based F1 (±3s collar): predicted vs reference timestamps with tolerance
+    - BERTScore for titles, with two different strategies:
+        - Global Concatenation: concatenated predicted vs reference titles
+        - Temporally Matched: titles of predicted sections matching reference sections
+    - WER: word error rate for the transcript generated alongside (optional)
 
     Hypothesis is a plain Markdown transcript (no timestamps); chunkseg derives
     boundary timestamps and title time associations via forced alignment internally.
@@ -312,9 +313,6 @@ def score_achap(
       <metadata><audio_path>: path to audio file
       <metadata><transcript>: reference transcript text (optional; enables WER)
     """
-    import json
-    from chunkseg import evaluate_batch
-
     chunkseg_lang = _CHUNKSEG_LANG.get(lang, "eng")
     samples = []
     has_transcript = False
@@ -357,17 +355,14 @@ def score_achap(
         tolerance=5.0,
     )
 
-    def _mean(key):
-        return results.get(key, {}).get("mean", 0.0)
-
     out = {
-        "collar_f1":  _mean("collar_f1"),
-        "tm_bs_f1":   _mean("tm_bs_f1"),
-        "gc_bs_f1":   _mean("gc_bs_f1"),
-        "tm_matched": _mean("tm_matched"),
+        "ACHAP-CollarF1": results["collar_f1"]["mean"],
+        "ACHAP-TM-BERTScore": results["tm_bs_f1"]["mean"],
+        "ACHAP-GC-BERTScore": results["gc_bs_f1"]["mean"],
+        "ACHAP-TM-MATCHED": results["tm_matched"]["mean"],
     }
     if has_transcript:
-        out["wer"] = _mean("wer")
+        out["ACHAP-WER"] = results["wer"]["mean"]
     return out
 
 
@@ -409,13 +404,7 @@ def main(
             assert "TRANS" in ref.keys()
             scores["TRANS-COMET"] = score_st(hypo, ref, lang)
         if "ACHAP" in ref.keys():
-            achap = score_achap(hypo, ref, lang)
-            scores["ACHAP-CollarF1"]   = achap.get("collar_f1", 0.0)
-            scores["ACHAP-TM-BERTScore"]      = achap.get("tm_bs_f1", 0.0)
-            scores["ACHAP-GC-BERTScore"]      = achap.get("gc_bs_f1", 0.0)
-            scores["ACHAP-TM-MATCHED"] = achap.get("tm_matched", 0.0)
-            if "wer" in achap:
-                scores["ACHAP-WER"] = achap["wer"]
+            scores.update(score_achap(hypo, ref, lang))
     return scores
 
 

From 36845cdc332d0f0ee666ab6502eea31f6a2e2671 Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <fabian.retkowski@kit.edu>
Date: Tue, 24 Feb 2026 12:32:03 +0100
Subject: [PATCH 04/16] Fix remaining linting issues

---
 src/mcif/evaluation.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index e0f2bee..ab7b736 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -48,6 +48,7 @@
 # ISO 639-1 → ISO 639-3 mapping for chunkseg forced alignment
 _CHUNKSEG_LANG = {"en": "eng", "de": "deu", "it": "ita", "zh": "zho"}
 
+
 @dataclass
 class ReferenceSample:
     sample_ids: List[str]
@@ -306,7 +307,8 @@ def score_achap(
     boundary timestamps and title time associations via forced alignment internally.
 
     Following the work of:
-    `"Beyond Transcripts: A Renewed Perspective on Audio Chaptering" <https://www.arxiv.org/abs/2602.08979>`_
+    `"Beyond Transcripts: A Renewed Perspective on Audio Chaptering"
+    <https://www.arxiv.org/abs/2602.08979>`_
 
     Reference XML format:
       <reference>: JSON [[title, start_seconds], ...]

From 3cf8154329e4ba6a92e4e40f485fa2121b6af638 Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <f@retkow.ski>
Date: Wed, 11 Mar 2026 22:15:06 +0100
Subject: [PATCH 05/16] chunkseg_lang: no default, direct indexing

---
 src/mcif/evaluation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index ab7b736..29acb84 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -315,7 +315,7 @@ def score_achap(
       <metadata><audio_path>: path to audio file
       <metadata><transcript>: reference transcript text (optional; enables WER)
     """
-    chunkseg_lang = _CHUNKSEG_LANG.get(lang, "eng")
+    chunkseg_lang = _CHUNKSEG_LANG[lang]
     samples = []
     has_transcript = False
 

From 5e66bcedf1535d050594434ea446e0f7547cc942 Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <fabian.retkowski@kit.edu>
Date: Fri, 13 Mar 2026 17:49:17 +0100
Subject: [PATCH 06/16] Inject audio path from source into reference metadata

---
 src/mcif/evaluation.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index 29acb84..487c9e2 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -1,3 +1,4 @@
+
 # Copyright 2025 FBK, KIT
 
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -141,8 +142,17 @@ def read_reference(
         ref_path: Path,
         track: str,
         language: str,
-        modality: Optional[str] = None) -> Dict[str, Dict[str, ReferenceSample]]:
+        modality: Optional[str] = None,
+        hypo_path: Optional[Path] = None) -> Dict[str, Dict[str, ReferenceSample]]:
     xml = ET.parse(ref_path)
+    audio_paths = {}
+    if hypo_path is not None:
+        hypo_xml = ET.parse(hypo_path)
+        for task in hypo_xml.getroot().iter("task"):
+            if task.attrib['track'] == track and task.attrib['text_lang'] == language:
+                for s in task.iter("sample"):
+                    audio_paths[s.attrib['id']] = s.find('audio_path').text
+                break
     avail_tasks = []
     for task in xml.getroot().iter("task"):
         if task.attrib['track'] == track and task.attrib['text_lang'] == language:
@@ -160,6 +170,9 @@ def read_reference(
                     for field in ['qa_type', 'qa_origin']:
                         if field in sample.attrib:
                             sample_metadata[field] = sample.attrib[field]
+                    for sid in sample_ids:
+                        if sid in audio_paths:
+                            sample_metadata['audio_path'] = audio_paths[sid]
                     samples_by_subtask[sample.attrib['task']][sample.attrib['iid']] = \
                         ReferenceSample(sample_ids, sample_reference, sample_metadata)
             return samples_by_subtask
@@ -312,7 +325,6 @@ def score_achap(
 
     Reference XML format:
       <reference>: JSON [[title, start_seconds], ...]
-      <metadata><audio_path>: path to audio file
       <metadata><transcript>: reference transcript text (optional; enables WER)
     """
     chunkseg_lang = _CHUNKSEG_LANG[lang]
@@ -379,7 +391,8 @@ def main(
     Main function computing all the scores and returning a Dictionary with the scores
     """
     hypo = read_hypo(hypo_path, track, lang)
-    ref = read_reference(ref_path, track, lang, modality=filter_modality)
+    ref = read_reference(ref_path, track, lang, modality=filter_modality,
+                         hypo_path=hypo_path)
     scores = {}
     assert "QA" in ref.keys()
     scores["QA-BERTScore"], qa_types_scores = score_sqa(hypo, ref, lang, breakdown_qa_types)

From 42cfbdfbceb1f5c59ec706cc6c70883d4d45ff18 Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <fabian.retkowski@kit.edu>
Date: Fri, 13 Mar 2026 19:33:29 +0100
Subject: [PATCH 07/16] Assume audio_path in reference file

---
 src/mcif/evaluation.py | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index 487c9e2..b819715 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -1,4 +1,3 @@
-
 # Copyright 2025 FBK, KIT
 
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -142,17 +141,8 @@ def read_reference(
         ref_path: Path,
         track: str,
         language: str,
-        modality: Optional[str] = None,
-        hypo_path: Optional[Path] = None) -> Dict[str, Dict[str, ReferenceSample]]:
+        modality: Optional[str] = None) -> Dict[str, Dict[str, ReferenceSample]]:
     xml = ET.parse(ref_path)
-    audio_paths = {}
-    if hypo_path is not None:
-        hypo_xml = ET.parse(hypo_path)
-        for task in hypo_xml.getroot().iter("task"):
-            if task.attrib['track'] == track and task.attrib['text_lang'] == language:
-                for s in task.iter("sample"):
-                    audio_paths[s.attrib['id']] = s.find('audio_path').text
-                break
     avail_tasks = []
     for task in xml.getroot().iter("task"):
         if task.attrib['track'] == track and task.attrib['text_lang'] == language:
@@ -163,16 +153,13 @@ def read_reference(
                         samples_by_subtask[sample.attrib['task']] = {}
                     sample_ids = sample.attrib['id'].split(",")
                     sample_reference = next(sample.iter('reference')).text
-                    sample_metadata = {}
+                    sample_metadata = {'audio_path': next(sample.iter('audio_path')).text}
                     for metadata in sample.iter('metadata'):
                         for metadata_field in metadata.iter():
                             sample_metadata[metadata_field.tag] = metadata_field.text
                     for field in ['qa_type', 'qa_origin']:
                         if field in sample.attrib:
                             sample_metadata[field] = sample.attrib[field]
-                    for sid in sample_ids:
-                        if sid in audio_paths:
-                            sample_metadata['audio_path'] = audio_paths[sid]
                     samples_by_subtask[sample.attrib['task']][sample.attrib['iid']] = \
                         ReferenceSample(sample_ids, sample_reference, sample_metadata)
             return samples_by_subtask
@@ -391,8 +378,7 @@ def main(
     Main function computing all the scores and returning a Dictionary with the scores
     """
     hypo = read_hypo(hypo_path, track, lang)
-    ref = read_reference(ref_path, track, lang, modality=filter_modality,
-                         hypo_path=hypo_path)
+    ref = read_reference(ref_path, track, lang, modality=filter_modality)
     scores = {}
     assert "QA" in ref.keys()
     scores["QA-BERTScore"], qa_types_scores = score_sqa(hypo, ref, lang, breakdown_qa_types)

From e321f3adcdc9721503601a94308eaf5aaf01d50f Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <f@retkow.ski>
Date: Mon, 16 Mar 2026 18:04:20 +0100
Subject: [PATCH 08/16] Fix LookupError by updating chunkseg dependency version
 to 0.3.2

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 95afcc9..0487032 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ dependencies = [
     "unbabel-comet==2.2.4",
     "whisper_normalizer==0.0.10",
     "mutagen>=1.47",
-    "chunkseg[align,titles]==0.3.1"
+    "chunkseg[align,titles]==0.3.2"
 ]
 
 dynamic = ["version"]

From 244e06b6f457ecf603e4f0ea64a1052ca45ced94 Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <fabian.retkowski@kit.edu>
Date: Wed, 18 Mar 2026 18:49:50 +0100
Subject: [PATCH 09/16] Use relative path instead of absolute path, chunkseg
 version bump

---
 pyproject.toml         | 2 +-
 src/mcif/evaluation.py | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 95afcc9..b176f55 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ dependencies = [
     "unbabel-comet==2.2.4",
     "whisper_normalizer==0.0.10",
     "mutagen>=1.47",
-    "chunkseg[align,titles]==0.3.1"
+    "chunkseg[align,titles]==0.3.3"
 ]
 
 dynamic = ["version"]
diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index b819715..472827e 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -292,6 +292,7 @@ def _audio_duration(audio_path: str) -> float:
 
 
 def score_achap(
+        base_ref_path: Path,
         hypo_dict: Dict[str, str],
         ref_dict: Dict[str, Dict[str, ReferenceSample]],
         lang: str) -> Dict[str, float]:
@@ -326,8 +327,8 @@ def score_achap(
         ref_chapters = json.loads(ref_sample.reference)  # [[title, start_sec], ...]
         ref_titles = [(t, float(s)) for t, s in ref_chapters]
         ref_boundaries = [float(s) for _, s in ref_chapters]
-        audio_path = ref_sample.metadata["audio_path"]
-        duration = _audio_duration(audio_path)
+        audio_path = base_ref_path / "LONG_AUDIOS" / ref_sample.metadata["audio_path"]
+        duration = _audio_duration(audio_path.absolute().as_posix())
         transcript = ref_sample.metadata.get("transcript")
         if transcript is not None:
             has_transcript = True
@@ -336,7 +337,7 @@ def score_achap(
             "hypothesis": hypo_text,
             "reference": ref_boundaries,
             "duration": duration,
-            "audio": audio_path,
+            "audio": audio_path.absolute().as_posix(),
             "reference_titles": ref_titles,
         }
         if transcript is not None:
@@ -405,7 +406,7 @@ def main(
             assert "TRANS" in ref.keys()
             scores["TRANS-COMET"] = score_st(hypo, ref, lang)
         if "ACHAP" in ref.keys():
-            scores.update(score_achap(hypo, ref, lang))
+            scores.update(score_achap(ref_path.parent, hypo, ref, lang))
     return scores
 
 

From 5edab8defd4b325e09fd9a0a9f615a423ab646c6 Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <fabian.retkowski@kit.edu>
Date: Thu, 19 Mar 2026 18:55:45 +0100
Subject: [PATCH 10/16] Handle crosslingual evaluation by aligning hypothesis
 translation with reference transcript and using the latter for forced
 alignment

---
 pyproject.toml         |  2 +-
 src/mcif/evaluation.py | 74 +++++++++++++++++++++++++++++++++++++++---
 2 files changed, 70 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b176f55..b40193e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ dependencies = [
     "unbabel-comet==2.2.4",
     "whisper_normalizer==0.0.10",
     "mutagen>=1.47",
-    "chunkseg[align,titles]==0.3.3"
+    "chunkseg[align,titles]==0.3.4"
 ]
 
 dynamic = ["version"]
diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index 472827e..ea787d4 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -27,6 +27,7 @@
 import bert_score
 import jiwer
 from chunkseg import evaluate_batch
+from chunkseg.parsers import parse_transcript
 from comet import download_model, load_from_checkpoint
 from whisper_normalizer import english, basic
 
@@ -291,6 +292,56 @@ def _audio_duration(audio_path: str) -> float:
     return File(audio_path).info.length
 
 
+def _replace_translation_with_transcript(
+        hypo_text: str,
+        gold_translation: str,
+        ref_transcript: str,
+        target_lang: str) -> str:
+    """Replace translated hypothesis body with English transcript via mwerSegmenter."""
+    parsed = parse_transcript(hypo_text, "markdown")
+    titles = parsed.titles or []
+    sections = parsed.sections or []
+
+    if not titles or not sections:
+        return hypo_text
+
+    section_texts = [" ".join(sents) for sents in sections]
+    full_hyp = " ".join(section_texts)
+
+    gold_lines = [l for l in gold_translation.strip().split("\n") if l.strip()]
+    ref_lines = [l for l in ref_transcript.strip().split("\n") if l.strip()]
+    assert len(gold_lines) == len(ref_lines), \
+        f"Gold translation ({len(gold_lines)}) and transcript ({len(ref_lines)}) " \
+        f"line counts differ"
+
+    segmenter = MwerSegmenter(character_level=(target_lang in CHAR_LEVEL_LANGS))
+    reseg = segmenter(full_hyp, gold_lines)
+
+    section_ends, pos = [], 0
+    for t in section_texts:
+        pos += len(t)
+        section_ends.append(pos)
+        pos += 1
+
+    section_ref: List[List[str]] = [[] for _ in titles]
+    hyp_pos, sec_idx = 0, 0
+    for i, seg in enumerate(reseg):
+        seg = seg.strip()
+        if not seg:
+            continue
+        found = full_hyp.find(seg, hyp_pos)
+        mid = found + len(seg) // 2 if found >= 0 else hyp_pos
+        if found >= 0:
+            hyp_pos = found + len(seg)
+        while sec_idx < len(section_ends) - 1 and mid >= section_ends[sec_idx]:
+            sec_idx += 1
+        section_ref[sec_idx].append(ref_lines[i])
+
+    return "\n".join(
+        f"# {t}\n{' '.join(r)}\n" for t, r in zip(titles, section_ref)
+    ).strip()
+
+
 def score_achap(
         base_ref_path: Path,
         hypo_dict: Dict[str, str],
@@ -307,15 +358,21 @@ def score_achap(
     Hypothesis is a plain Markdown transcript (no timestamps); chunkseg derives
     boundary timestamps and title time associations via forced alignment internally.
 
+    For crosslingual evaluation, the translated hypothesis is
+    aligned to the gold translation via mwerSegmenter and replaced with the
+    reference transcript before passing to chunkseg.
+
     Following the work of:
     `"Beyond Transcripts: A Renewed Perspective on Audio Chaptering"
     <https://www.arxiv.org/abs/2602.08979>`_
 
     Reference XML format:
       <reference>: JSON [[title, start_seconds], ...]
-      <metadata><transcript>: reference transcript text (optional; enables WER)
+      <metadata><transcript>: English reference transcript (optional; enables WER)
+      <metadata><translation>: gold translation, line-aligned with transcript
+          (crosslingual only)
     """
-    chunkseg_lang = _CHUNKSEG_LANG[lang]
+    crosslingual = (lang != "en")
     samples = []
     has_transcript = False
 
@@ -330,7 +387,13 @@ def score_achap(
         audio_path = base_ref_path / "LONG_AUDIOS" / ref_sample.metadata["audio_path"]
         duration = _audio_duration(audio_path.absolute().as_posix())
         transcript = ref_sample.metadata.get("transcript")
-        if transcript is not None:
+        translation = ref_sample.metadata.get("translation")
+
+        if crosslingual and translation is not None and transcript is not None:
+            hypo_text = _replace_translation_with_transcript(
+                hypo_text, translation, transcript, lang)
+
+        if transcript is not None and not crosslingual:
             has_transcript = True
 
         sample = {
@@ -340,7 +403,7 @@ def score_achap(
             "audio": audio_path.absolute().as_posix(),
             "reference_titles": ref_titles,
         }
-        if transcript is not None:
+        if has_transcript:
             sample["reference_transcript"] = transcript
         samples.append(sample)
 
@@ -350,7 +413,8 @@ def score_achap(
     results = evaluate_batch(
         samples,
         format="markdown",
-        lang=chunkseg_lang,
+        src_lang="eng",
+        tgt_lang=lang,
         titles=True,
         wer=has_transcript,
         collar=3.0,

From f4f0b284b7da457b54a28e308b39beb178975340 Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <fabian.retkowski@kit.edu>
Date: Thu, 19 Mar 2026 19:08:07 +0100
Subject: [PATCH 11/16] Fix linting issue

---
 src/mcif/evaluation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index ea787d4..8c095fe 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -308,8 +308,8 @@ def _replace_translation_with_transcript(
     section_texts = [" ".join(sents) for sents in sections]
     full_hyp = " ".join(section_texts)
 
-    gold_lines = [l for l in gold_translation.strip().split("\n") if l.strip()]
-    ref_lines = [l for l in ref_transcript.strip().split("\n") if l.strip()]
+    gold_lines = [s for s in gold_translation.strip().split("\n") if s.strip()]
+    ref_lines = [s for s in ref_transcript.strip().split("\n") if s.strip()]
     assert len(gold_lines) == len(ref_lines), \
         f"Gold translation ({len(gold_lines)}) and transcript ({len(ref_lines)}) " \
         f"line counts differ"

From bae4a9216e8f4da767bd75743e3e005ea0f3e7d7 Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <f@retkow.ski>
Date: Fri, 20 Mar 2026 13:57:56 +0100
Subject: [PATCH 12/16] Update src/mcif/evaluation.py

Co-authored-by: sarapapi <57095209+sarapapi@users.noreply.github.com>
---
 src/mcif/evaluation.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index 8c095fe..b860b39 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -369,8 +369,7 @@ def score_achap(
     Reference XML format:
       <reference>: JSON [[title, start_seconds], ...]
       <metadata><transcript>: English reference transcript (optional; enables WER)
-      <metadata><translation>: gold translation, line-aligned with transcript
-          (crosslingual only)
+      <metadata><translation>: reference translation, line-aligned with transcript
     """
     crosslingual = (lang != "en")
     samples = []

From 985ce3dc98c3812daa0448f4f7f8b3c752ed10bd Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <fabian.retkowski@kit.edu>
Date: Fri, 20 Mar 2026 15:45:41 +0100
Subject: [PATCH 13/16] Always require transcript

---
 src/mcif/evaluation.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index b860b39..2b71d01 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -353,7 +353,7 @@ def score_achap(
     - BERTScore for titles, with two different strategies:
         - Global Concatenation: concatenated predicted vs reference titles
         - Temporally Matched: titles of predicted sections matching reference sections
-    - WER: word error rate for the transcript generated alongside (optional)
+    - WER: word error rate for the transcript generated alongside
 
     Hypothesis is a plain Markdown transcript (no timestamps); chunkseg derives
     boundary timestamps and title time associations via forced alignment internally.
@@ -368,12 +368,11 @@ def score_achap(
 
     Reference XML format:
       <reference>: JSON [[title, start_seconds], ...]
-      <metadata><transcript>: English reference transcript (optional; enables WER)
+      <metadata><transcript>: English reference transcript
       <metadata><translation>: reference translation, line-aligned with transcript
     """
     crosslingual = (lang != "en")
     samples = []
-    has_transcript = False
 
     for iid, ref_sample in ref_dict["ACHAP"].items():
         assert len(ref_sample.sample_ids) == 1, \
@@ -385,25 +384,21 @@ def score_achap(
         ref_boundaries = [float(s) for _, s in ref_chapters]
         audio_path = base_ref_path / "LONG_AUDIOS" / ref_sample.metadata["audio_path"]
         duration = _audio_duration(audio_path.absolute().as_posix())
-        transcript = ref_sample.metadata.get("transcript")
-        translation = ref_sample.metadata.get("translation")
+        transcript = ref_sample.metadata["transcript"]
 
-        if crosslingual and translation is not None and transcript is not None:
+        if crosslingual:
+            translation = ref_sample.metadata["translation"]
             hypo_text = _replace_translation_with_transcript(
                 hypo_text, translation, transcript, lang)
 
-        if transcript is not None and not crosslingual:
-            has_transcript = True
-
         sample = {
             "hypothesis": hypo_text,
             "reference": ref_boundaries,
             "duration": duration,
             "audio": audio_path.absolute().as_posix(),
             "reference_titles": ref_titles,
+            "reference_transcript": transcript,
         }
-        if has_transcript:
-            sample["reference_transcript"] = transcript
         samples.append(sample)
 
     if not samples:
@@ -415,7 +410,7 @@ def score_achap(
         src_lang="eng",
         tgt_lang=lang,
         titles=True,
-        wer=has_transcript,
+        wer=not crosslingual,
         collar=3.0,
         tolerance=5.0,
     )
@@ -426,7 +421,7 @@ def score_achap(
         "ACHAP-GC-BERTScore": results["gc_bs_f1"]["mean"],
         "ACHAP-TM-MATCHED": results["tm_matched"]["mean"],
     }
-    if has_transcript:
+    if not crosslingual:
         out["ACHAP-WER"] = results["wer"]["mean"]
     return out
 

From 93d8fc5bf37890949dcea5c3fc6c2d2a4ecff692 Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <fabian.retkowski@kit.edu>
Date: Fri, 20 Mar 2026 15:59:27 +0100
Subject: [PATCH 14/16] Separate alignment into its own function

---
 src/mcif/evaluation.py | 42 ++++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index 2b71d01..1397ee4 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -292,28 +292,21 @@ def _audio_duration(audio_path: str) -> float:
     return File(audio_path).info.length
 
 
-def _replace_translation_with_transcript(
+def _align_sections(
         hypo_text: str,
-        gold_translation: str,
-        ref_transcript: str,
-        target_lang: str) -> str:
-    """Replace translated hypothesis body with English transcript via mwerSegmenter."""
+        gold_lines: List[str],
+        target_lang: str) -> Tuple[List[str], List[List[int]]]:
+    """Align hypothesis sections to gold translation lines via mwerSegmenter."""
     parsed = parse_transcript(hypo_text, "markdown")
     titles = parsed.titles or []
     sections = parsed.sections or []
 
     if not titles or not sections:
-        return hypo_text
+        return titles, [[] for _ in titles]
 
     section_texts = [" ".join(sents) for sents in sections]
     full_hyp = " ".join(section_texts)
 
-    gold_lines = [s for s in gold_translation.strip().split("\n") if s.strip()]
-    ref_lines = [s for s in ref_transcript.strip().split("\n") if s.strip()]
-    assert len(gold_lines) == len(ref_lines), \
-        f"Gold translation ({len(gold_lines)}) and transcript ({len(ref_lines)}) " \
-        f"line counts differ"
-
     segmenter = MwerSegmenter(character_level=(target_lang in CHAR_LEVEL_LANGS))
     reseg = segmenter(full_hyp, gold_lines)
 
@@ -323,7 +316,7 @@ def _replace_translation_with_transcript(
         section_ends.append(pos)
         pos += 1
 
-    section_ref: List[List[str]] = [[] for _ in titles]
+    section_to_line_map: List[List[int]] = [[] for _ in titles]
     hyp_pos, sec_idx = 0, 0
     for i, seg in enumerate(reseg):
         seg = seg.strip()
@@ -335,7 +328,28 @@ def _replace_translation_with_transcript(
             hyp_pos = found + len(seg)
         while sec_idx < len(section_ends) - 1 and mid >= section_ends[sec_idx]:
             sec_idx += 1
-        section_ref[sec_idx].append(ref_lines[i])
+        section_to_line_map[sec_idx].append(i)
+
+    return titles, section_to_line_map
+
+
+def _replace_translation_with_transcript(
+        hypo_text: str,
+        gold_translation: str,
+        ref_transcript: str,
+        target_lang: str) -> str:
+    """Replace translated hypothesis body with reference transcript via mwerSegmenter."""
+    gold_lines = [s for s in gold_translation.strip().split("\n") if s.strip()]
+    ref_lines = [s for s in ref_transcript.strip().split("\n") if s.strip()]
+    assert len(gold_lines) == len(ref_lines), \
+        f"Gold translation ({len(gold_lines)}) and transcript ({len(ref_lines)}) " \
+        f"line counts differ"
+
+    titles, section_to_line_map = _align_sections(hypo_text, gold_lines, target_lang)
+    if not titles:
+        return hypo_text
+
+    section_ref = [[ref_lines[i] for i in indices] for indices in section_to_line_map]
 
     return "\n".join(
         f"# {t}\n{' '.join(r)}\n" for t, r in zip(titles, section_ref)

From 89d0bde005a98b72d2d2dcf065dee6450524b170 Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <fabian.retkowski@kit.edu>
Date: Tue, 24 Mar 2026 13:22:33 +0100
Subject: [PATCH 15/16] Calculate ACHAP-COMET

---
 src/mcif/evaluation.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index 1397ee4..84a70ea 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -367,7 +367,7 @@ def score_achap(
     - BERTScore for titles, with two different strategies:
         - Global Concatenation: concatenated predicted vs reference titles
         - Temporally Matched: titles of predicted sections matching reference sections
-    - WER: word error rate for the transcript generated alongside
+    - WER/COMET: quality measure for the transcript/translation generated alongside the chapters
 
     Hypothesis is a plain Markdown transcript (no timestamps); chunkseg derives
     boundary timestamps and title time associations via forced alignment internally.
@@ -387,6 +387,7 @@ def score_achap(
     """
     crosslingual = (lang != "en")
     samples = []
+    comet_data = []
 
     for iid, ref_sample in ref_dict["ACHAP"].items():
         assert len(ref_sample.sample_ids) == 1, \
@@ -405,6 +406,16 @@ def score_achap(
             hypo_text = _replace_translation_with_transcript(
                 hypo_text, translation, transcript, lang)
 
+            # Prepare COMET data
+            gold_lines = [s for s in translation.strip().split("\n") if s.strip()]
+            src_lines = [s for s in transcript.strip().split("\n") if s.strip()]
+            segmenter = MwerSegmenter(character_level=(lang in CHAR_LEVEL_LANGS))
+            parsed = parse_transcript(hypo_dict[ref_sample.sample_ids[0]], "markdown")
+            flat = " ".join(" ".join(s) for s in (parsed.sections or []))
+            reseg = segmenter(flat, gold_lines)
+            for mt, ref, src in zip(reseg, gold_lines, src_lines):
+                comet_data.append({"src": src.strip(), "mt": mt.strip(), "ref": ref.strip()})
+
         sample = {
             "hypothesis": hypo_text,
             "reference": ref_boundaries,
@@ -435,7 +446,9 @@ def score_achap(
         "ACHAP-GC-BERTScore": results["gc_bs_f1"]["mean"],
         "ACHAP-TM-MATCHED": results["tm_matched"]["mean"],
     }
-    if not crosslingual:
+    if crosslingual:
+        out["ACHAP-COMET"] = comet_score(comet_data)
+    else:
         out["ACHAP-WER"] = results["wer"]["mean"]
     return out
 

From ec53e8569d959998ee39934bb3fa40d70421a0f7 Mon Sep 17 00:00:00 2001
From: Fabian Retkowski <f@retkow.ski>
Date: Tue, 24 Mar 2026 16:57:08 +0100
Subject: [PATCH 16/16] Remove unused language mapping

---
 src/mcif/evaluation.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/mcif/evaluation.py b/src/mcif/evaluation.py
index 84a70ea..764e4a9 100644
--- a/src/mcif/evaluation.py
+++ b/src/mcif/evaluation.py
@@ -46,9 +46,6 @@
 
 CHAR_LEVEL_LANGS = {"zh"}
 
-# ISO 639-1 → ISO 639-3 mapping for chunkseg forced alignment
-_CHUNKSEG_LANG = {"en": "eng", "de": "deu", "it": "ita", "zh": "zho"}
-
 
 @dataclass
 class ReferenceSample: