From 808c392c0be4287f1fee1db21dc7128357042583 Mon Sep 17 00:00:00 2001 From: Thomas Petit-Jean Date: Wed, 16 Oct 2024 09:15:17 +0200 Subject: [PATCH 01/12] ci: use ubuntu-22 instead of latest to keep python37 compatibility --- .github/workflows/documentation.yml | 2 +- .github/workflows/release.yml | 8 ++++---- .github/workflows/test-build.yml | 4 ++-- .github/workflows/tests.yml | 8 ++++---- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 06aac1ac1..d182c0ae0 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -13,7 +13,7 @@ env: jobs: Documentation: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 65de04d5a..b6b9198d0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -24,7 +24,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-latest] + os: [ubuntu-22.04, windows-latest, macos-latest] steps: - uses: actions/checkout@v4 @@ -42,7 +42,7 @@ jobs: build_sdist: name: Build source distribution - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 @@ -58,7 +58,7 @@ jobs: name: Upload to PyPI needs: [build_wheels, build_sdist] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/download-artifact@v4 @@ -76,7 +76,7 @@ jobs: # repository_url: https://test.pypi.org/legacy/ Documentation: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 0b849728b..ac64a28b3 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -17,7 +17,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-latest] + os: [ubuntu-22.04, windows-latest, macos-latest] steps: - uses: actions/checkout@v2 @@ -30,7 +30,7 @@ jobs: build_sdist: name: Build source distribution - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 728547434..f55139d32 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -15,7 +15,7 @@ jobs: linting: name: Linting if: github.event_name == 'pull_request' - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 with: @@ -32,7 +32,7 @@ jobs: pytest: name: Pytest - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: fail-fast: true matrix: @@ -120,7 +120,7 @@ jobs: documentation: name: Documentation - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 @@ -150,7 +150,7 @@ jobs: simple-installation: name: Simple installation - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: fail-fast: true matrix: From 543d5df5dbf1c3d3b44da53ce1fcea4e9bc6a9ca Mon Sep 17 00:00:00 2001 From: Thomas Petit-Jean Date: Wed, 16 Oct 2024 09:16:45 +0200 Subject: [PATCH 02/12] feat: handle linebreak inside words and linebreak without leading whitespace --- edsnlp/language.py | 10 +++++++- .../core/normalizer/pollution/patterns.py | 5 ++++ edsnlp/utils/doc_to_text.py | 10 ++++++-- tests/pipelines/core/test_normalisation.py | 24 ++++++++++++++----- tests/test_language.py | 12 ++++++++++ 5 files changed, 52 insertions(+), 9 deletions(-) diff --git a/edsnlp/language.py 
b/edsnlp/language.py index f61c804b3..8a6df7e9c 100644 --- a/edsnlp/language.py +++ b/edsnlp/language.py @@ -41,7 +41,15 @@ class EDSLanguage(French): Defaults = EDSDefaults -TOKENIZER_EXCEPTIONS = [r"Dr\.", r"Pr\.", r"M\.", r"Mme\.", r"Mlle\.", r"(?i:(?:ep\.))"] +TOKENIZER_EXCEPTIONS = [ + r"Dr\.", + r"Pr\.", + r"M\.", + r"Mme\.", + r"Mlle\.", + r"(?i:(?:ep\.))", + r"-\n", +] class EDSTokenizer(Tokenizer): diff --git a/edsnlp/pipes/core/normalizer/pollution/patterns.py b/edsnlp/pipes/core/normalizer/pollution/patterns.py index ad5ae5100..915a81131 100644 --- a/edsnlp/pipes/core/normalizer/pollution/patterns.py +++ b/edsnlp/pipes/core/normalizer/pollution/patterns.py @@ -40,6 +40,9 @@ footer = rf"(?i)({page}.*\n?pat.*(ipp)?.*\n?(courrier valid.*)?)" footer += rf"|(.*{date}.*{ipp}.*)|(imprim.\sle\s{date}.*\d/\d.*\n?pat.*{date})" +# Word split in the middle due to line break +intraword_split = r"-\n" + pollution = dict( information=information, bars=bars, @@ -48,6 +51,7 @@ web=web, coding=coding, footer=footer, + intraword_split=intraword_split, ) default_enabled = dict( @@ -58,4 +62,5 @@ web=True, coding=False, footer=True, + intraword_split=True, ) diff --git a/edsnlp/utils/doc_to_text.py b/edsnlp/utils/doc_to_text.py index b9ea7043e..2a525e94a 100644 --- a/edsnlp/utils/doc_to_text.py +++ b/edsnlp/utils/doc_to_text.py @@ -82,8 +82,8 @@ def aggregate_tokens( else: keep_list = [True] * len(arr) - for i, (str_hash, space, keep) in enumerate( - zip(tokens_text, tokens_space, keep_list) + for i, (str_hash, tag_hash, space, keep) in enumerate( + zip(tokens_text, tokens_tag, tokens_space, keep_list) ): if keep: if space: @@ -99,6 +99,12 @@ def aggregate_tokens( offset += len(part) ends[i] = offset else: + if i > 0 and tag_hash == space_hash: + if text_parts[i - 1][-1:] and ( + text_parts[i - 1][-1:] not in (" ", "\n") + ): + text_parts[i - 1] += " " + offset += 1 begins[i] = offset ends[i] = offset diff --git a/tests/pipelines/core/test_normalisation.py b/tests/pipelines/core/test_normalisation.py index 4cfa7b039..3628145bb 100644 --- a/tests/pipelines/core/test_normalisation.py +++ b/tests/pipelines/core/test_normalisation.py @@ -1,3 +1,4 @@ +import spacy from pytest import fixture from edsnlp.matchers.utils import get_text @@ -25,7 +26,6 @@ def test_full_normalization(doc): @fixture def nlp_factory(blank_nlp): def f(a=False, lc=False, q=False, p=False): - if a: a = dict(accents=accents) if q: @@ -48,7 +48,6 @@ def f(a=False, lc=False, q=False, p=False): def test_normalization_accents(nlp_factory, text): - nlp = nlp_factory(a=True) doc = nlp(text) @@ -58,7 +57,6 @@ def test_normalization_accents(nlp_factory, text): def test_normalization_spaces(nlp_factory, text): - nlp = nlp_factory(a=True) doc = nlp("Phrase avec des espaces \n et un retour à la ligne") @@ -67,7 +65,6 @@ def test_normalization_spaces(nlp_factory, text): def test_normalization_quotes(nlp_factory, text): - nlp = nlp_factory(q=True) doc = nlp(text) @@ -79,7 +76,6 @@ def test_normalization_quotes(nlp_factory, text): def test_normalization_lowercase(nlp_factory, text): - nlp = nlp_factory(lc=True) doc = nlp(text) @@ -88,8 +84,24 @@ def test_normalization_lowercase(nlp_factory, text): assert norm.startswith("l'aïeul") -def test_normalization_pollution(nlp_factory, text): +def test_normalization_pollution_with_eds_lang(): + nlp = spacy.blank("eds") + nlp.add_pipe("eds.normalizer") + text = "Il faut soigner ce diab-\nete" + doc = nlp(text) + norm = get_text(doc, attr="NORM", ignore_excluded=True) + assert norm == "il faut soigner ce 
diabete" + + +def test_normalization_linebreak_no_space(nlp_factory): + nlp = nlp_factory() + text = "Mode de vie: \nTabac\nAlcool\nPas de sport" + doc = nlp(text) + norm = get_text(doc, attr="NORM", ignore_excluded=True, ignore_space_tokens=True) + assert norm == "Mode de vie: Tabac Alcool Pas de sport" + +def test_normalization_pollution(nlp_factory, text): nlp = nlp_factory(p=True) doc = nlp(text) diff --git a/tests/test_language.py b/tests/test_language.py index 369c6df12..363a7f117 100644 --- a/tests/test_language.py +++ b/tests/test_language.py @@ -58,6 +58,18 @@ def test_eds_tokenizer_whitespace(): ] +def test_eds_tokenizer_intraword_split(): + nlp = spacy.blank("eds") + tokenized = [(w.text, w.whitespace_) for w in nlp("Un dia-\nbete ici")] + assert tokenized == [ + ("Un", " "), + ("dia", ""), + ("-\n", ""), + ("bete", " "), + ("ici", ""), + ] + + def test_eds_tokenizer_numbers(): nlp = spacy.blank("eds") tokenized = [(w.text, w.whitespace_) for w in nlp("Il fait 5.3/5.4mm")] From 698a8dfe7bd113f30d39434bd73a4641117f1a33 Mon Sep 17 00:00:00 2001 From: Thomas Petit-Jean Date: Wed, 16 Oct 2024 09:17:19 +0200 Subject: [PATCH 03/12] fix: handle case where status is None in behavior/disorder pipes --- edsnlp/pipes/ner/disorders/base.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/edsnlp/pipes/ner/disorders/base.py b/edsnlp/pipes/ner/disorders/base.py index 06d25358d..54855dbea 100644 --- a/edsnlp/pipes/ner/disorders/base.py +++ b/edsnlp/pipes/ner/disorders/base.py @@ -113,8 +113,17 @@ def __call__(self, doc: Doc) -> Doc: annotated spaCy Doc object """ spans = list(self.process(doc)) + all_detailed_status = set(self.detailed_status_mapping.keys()) for span in spans: - span._.detailed_status = self.detailed_status_mapping[span._.status] + if span._.status is not None and span._.status not in all_detailed_status: + raise ValueError( + f"Got incorrect status value for '{span}'. 
Expected " + f"None or one of {all_detailed_status}, got {span._.status}" + ) + span._.detailed_status = self.detailed_status_mapping.get( + span._.status, + None, + ) self.set_spans(doc, filter_spans(spans)) From 827f4bef31de9f9e8660f55fd6c1700b3cb5a833 Mon Sep 17 00:00:00 2001 From: Thomas Petit-Jean Date: Wed, 16 Oct 2024 09:18:17 +0200 Subject: [PATCH 04/12] fix: treat span as doc in Qualifier process method --- edsnlp/pipes/qualifiers/base.py | 8 ++++++-- edsnlp/pipes/qualifiers/family/family.py | 3 ++- edsnlp/pipes/qualifiers/history/history.py | 3 ++- edsnlp/pipes/qualifiers/hypothesis/hypothesis.py | 3 ++- edsnlp/pipes/qualifiers/negation/negation.py | 3 ++- .../pipes/qualifiers/reported_speech/reported_speech.py | 3 ++- 6 files changed, 16 insertions(+), 7 deletions(-) diff --git a/edsnlp/pipes/qualifiers/base.py b/edsnlp/pipes/qualifiers/base.py index 521baf0e6..0b41fac9e 100644 --- a/edsnlp/pipes/qualifiers/base.py +++ b/edsnlp/pipes/qualifiers/base.py @@ -172,9 +172,13 @@ def get_matches(self, doc: Doc) -> List[Span]: return list(matches) - def process(self, doc: Doc) -> BaseQualifierResults: + def ensure_doc(self, doc: Union[Doc, Span]) -> Doc: + return doc if not hasattr(doc, "as_doc") else doc.as_doc() + + def process(self, doc_like: Union[Doc, Span]) -> BaseQualifierResults: + doc_like = self.ensure_doc(doc_like) # pragma: no cover raise NotImplementedError def __call__(self, doc: Doc) -> Doc: - results = self.process(doc) + results = self.process(doc) # pragma: no cover raise NotImplementedError(f"{type(results)} should be used to tag the document") diff --git a/edsnlp/pipes/qualifiers/family/family.py b/edsnlp/pipes/qualifiers/family/family.py index e979e71de..12943d9eb 100644 --- a/edsnlp/pipes/qualifiers/family/family.py +++ b/edsnlp/pipes/qualifiers/family/family.py @@ -187,7 +187,8 @@ def set_extensions(self) -> None: if not Doc.has_extension("family"): Doc.set_extension("family", default=[]) - def process(self, doc: Doc) -> FamilyResults: + def process(self, doc_like: Union[Doc, Span]) -> FamilyResults: + doc = self.ensure_doc(doc_like) matches = self.get_matches(doc) terminations = [m for m in matches if m.label_ == "termination"] diff --git a/edsnlp/pipes/qualifiers/history/history.py b/edsnlp/pipes/qualifiers/history/history.py index 2dc8d56d8..ab5469c95 100644 --- a/edsnlp/pipes/qualifiers/history/history.py +++ b/edsnlp/pipes/qualifiers/history/history.py @@ -326,7 +326,8 @@ def set_extensions(self) -> None: getter=deprecated_getter_factory("antecedent_cues", "history_cues"), ) - def process(self, doc: Doc) -> HistoryResults: + def process(self, doc_like: Union[Doc, Span]) -> HistoryResults: + doc = self.ensure_doc(doc_like) note_datetime = None if doc._.note_datetime is not None: try: diff --git a/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py b/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py index 924d2cf63..68ed1d1d1 100644 --- a/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py +++ b/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py @@ -262,7 +262,8 @@ def load_verbs( list_hypo_verbs_following, ) - def process(self, doc: Doc) -> HypothesisResults: + def process(self, doc_like: Union[Doc, Span]) -> HypothesisResults: + doc = self.ensure_doc(doc_like) matches = self.get_matches(doc) terminations = [m for m in matches if m.label_ == "termination"] diff --git a/edsnlp/pipes/qualifiers/negation/negation.py b/edsnlp/pipes/qualifiers/negation/negation.py index fb2c7878f..ea4a4fc40 100644 --- a/edsnlp/pipes/qualifiers/negation/negation.py +++ 
b/edsnlp/pipes/qualifiers/negation/negation.py @@ -295,7 +295,8 @@ def __call__(self, doc: Doc) -> Doc: token._.negation = True return doc - def process(self, doc: Doc) -> NegationResults: + def process(self, doc_like: Union[Doc, Span]) -> NegationResults: + doc = self.ensure_doc(doc_like) matches = self.get_matches(doc) terminations = [m for m in matches if m.label_ == "termination"] diff --git a/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py b/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py index 77b0cbe91..759eb7091 100644 --- a/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py +++ b/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py @@ -226,7 +226,8 @@ def load_verbs(self, verbs: List[str]) -> List[str]: return list_rep_verbs - def process(self, doc: Doc) -> ReportedSpeechResults: + def process(self, doc_like: Union[Doc, Span]) -> ReportedSpeechResults: + doc = self.ensure_doc(doc_like) matches = self.get_matches(doc) matches += list(self.regex_matcher(doc, as_spans=True)) From 5d790d21ccbd236eee0f0653a0d633fd045ccc43 Mon Sep 17 00:00:00 2001 From: Thomas Petit-Jean Date: Wed, 16 Oct 2024 09:19:10 +0200 Subject: [PATCH 05/12] fix: small bug in alcohol and tobacco pipes --- edsnlp/pipes/ner/behaviors/alcohol/alcohol.py | 6 ++--- .../pipes/ner/behaviors/alcohol/patterns.py | 2 +- tests/pipelines/ner/disorders/alcohol.py | 22 ++++++++++++++++--- tests/pipelines/ner/disorders/test_all.py | 5 ++++- tests/pipelines/ner/disorders/tobacco.py | 10 ++++++--- 5 files changed, 34 insertions(+), 11 deletions(-) diff --git a/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py b/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py index ca543fa6e..c1bbc9177 100644 --- a/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py +++ b/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py @@ -118,8 +118,8 @@ def process(self, doc: Doc) -> List[Span]: stopped = self.negation.process(span) if not any(stopped_token.negation for stopped_token in stopped.tokens): span._.status = 2 - - if "zero_after" in span._.assigned.keys(): - span._.negation = True + else: + if "zero_after" in span._.assigned.keys(): + span._.negation = True yield span diff --git a/edsnlp/pipes/ner/behaviors/alcohol/patterns.py b/edsnlp/pipes/ner/behaviors/alcohol/patterns.py index 38c795926..7777225d3 100644 --- a/edsnlp/pipes/ner/behaviors/alcohol/patterns.py +++ b/edsnlp/pipes/ner/behaviors/alcohol/patterns.py @@ -3,7 +3,7 @@ regex=[ r"\balco[ol]", r"\bethyl", - r"(? 
Date: Wed, 16 Oct 2024 09:46:56 +0200 Subject: [PATCH 06/12] docs: add details for disorders and behavior pipes --- docs/pipes/ner/behaviors/alcohol.md | 2 + docs/pipes/ner/behaviors/index.md | 97 +----------------------- docs/pipes/ner/disorders/index.md | 56 +------------- docs/pipes/ner/disorders/presentation.md | 77 +++++++++++++++++++ docs/pipes/ner/disorders/warning.md | 7 ++ 5 files changed, 90 insertions(+), 149 deletions(-) create mode 100644 docs/pipes/ner/disorders/presentation.md create mode 100644 docs/pipes/ner/disorders/warning.md diff --git a/docs/pipes/ner/behaviors/alcohol.md b/docs/pipes/ner/behaviors/alcohol.md index f8c5772fb..be987c188 100644 --- a/docs/pipes/ner/behaviors/alcohol.md +++ b/docs/pipes/ner/behaviors/alcohol.md @@ -1,5 +1,7 @@ # Alcohol consumption {: #edsnlp.pipes.ner.behaviors.alcohol.factory.create_component } +--8<-- "docs/pipes/ner/disorders/warning.md" + ::: edsnlp.pipes.ner.behaviors.alcohol.factory.create_component options: heading_level: 2 diff --git a/docs/pipes/ner/behaviors/index.md b/docs/pipes/ner/behaviors/index.md index 8544c0255..aac46ed71 100644 --- a/docs/pipes/ner/behaviors/index.md +++ b/docs/pipes/ner/behaviors/index.md @@ -2,99 +2,6 @@ ## Presentation -EDS-NLP offers two components to extract behavioral patterns, namely the tobacco and alcohol consumption status. Each component is based on the ContextualMatcher component. -Some general considerations about those components: +EDS-NLP offers two components to extract behavioral patterns, namely the tobacco and alcohol consumption status. Each component is based on the [ContextualMatcher][edsnlp.pipes.core.contextual_matcher.ContextualMatcher] matcher, itself based on `eds.contextual_matcher` component. -- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`. -- The matched comorbidity is also available under the `ent.label_` of each match. -- Matches have an associated `_.status` attribute taking the value `1`, or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details. -- Some components add additional information to matches. For instance, the `tobacco` adds, if relevant, extracted *pack-year* (= *paquet-année*). Those information are available under the `ent._.assigned` attribute. -- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline with the following parameters: - ```{ .python .no-check } - nlp.add_pipe( - eds.normalizer( - accents=True, - lowercase=True, - quotes=True, - spaces=True, - pollution=dict( - information=True, - bars=True, - biology=True, - doctors=True, - web=True, - coding=True, - footer=True, - ), - ), - ) - ``` - -!!! warning "Use qualifiers" - Those components **should be used with a qualification pipeline** to avoid extracted unwanted matches. At the very least, you can use available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developed and trained specifically for those components. For privacy reason, the model isn't publicly available yet. - - !!! aphp "Use the ML model" - - The model will soon be available in the models catalogue of AP-HP's CDW. 
- -## Usage - -```{ .python .no-check } -import edsnlp, edsnlp.pipes as eds - -nlp = edsnlp.blank("eds") -nlp.add_pipe(eds.sentences()) -nlp.add_pipe( - eds.normalizer( - accents=True, - lowercase=True, - quotes=True, - spaces=True, - pollution=dict( - information=True, - bars=True, - biology=True, - doctors=True, - web=True, - coding=True, - footer=True, - ), - ), -) -nlp.add_pipe(eds.tobacco()) -nlp.add_pipe(eds.diabetes()) - -text = """ -Compte-rendu de consultation. - -Je vois ce jour M. SCOTT pour le suivi de sa rétinopathie diabétique. -Le patient va bien depuis la dernière fois. -Je le félicite pour la poursuite de son sevrage tabagique (toujours à 10 paquet-année). - -Sur le plan de son diabète, la glycémie est stable. -""" - -doc = nlp(text) - -doc.spans -# Out: { -# 'pollutions': [], -# 'tobacco': [sevrage tabagique (toujours à 10 paquet-année], -# 'diabetes': [rétinopathie diabétique, diabète] -# } - -tobacco_matches = doc.spans["tobacco"] -tobacco_matches[0]._.detailed_status -# Out: "ABSTINENCE" # - -tobacco_matches[0]._.assigned["PA"] # paquet-année -# Out: 10 # (1) - - -diabetes = doc.spans["diabetes"] -(diabetes[0]._.detailed_status, diabetes[1]._.detailed_status) -# Out: ('WITH_COMPLICATION', 'WITHOUT_COMPLICATION') # (2) -``` - -1. Here we see an example of additional information that can be extracted -2. Here we see the importance of document-level aggregation to extract the correct severity of each comorbidity. +--8<-- "docs/pipes/ner/disorders/presentation.md" diff --git a/docs/pipes/ner/disorders/index.md b/docs/pipes/ner/disorders/index.md index e261fcd98..ad0321b0b 100644 --- a/docs/pipes/ner/disorders/index.md +++ b/docs/pipes/ner/disorders/index.md @@ -2,58 +2,6 @@ ## Presentation -The following components extract 16 different conditions from the [Charlson Comorbidity Index](https://www.rdplf.org/calculateurs/pages/charlson/charlson.html). Each component is based on the ContextualMatcher component. +The following components extract 16 different conditions from the [Charlson Comorbidity Index](https://www.rdplf.org/calculateurs/pages/charlson/charlson.html). Each component is based on the [ContextualMatcher][edsnlp.pipes.core.contextual_matcher.ContextualMatcher] matcher, itself based on `eds.contextual_matcher` component. -The components were developed by AP-HP's Data Science team with a team of medical experts, following the insights of the algorithm proposed by [@petitjean_2024] - -Some general considerations about those components: - -- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`. -- The matched comorbidity is also available under the `ent.label_` of each match. -- Matches have an associated `_.status` attribute taking the value `1`, or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details. -- Some components add additional information to matches. For instance, the `tobacco` adds, if relevant, extracted *pack-year* (= *paquet-année*). Those information are available under the `ent._.assigned` attribute. -- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline with the following parameters: - - ```{ .python .no-check } - import edsnlp, edsnlp.pipes as eds - ... 
-
-    nlp.add_pipe(
-        eds.normalizer(
-            accents=True,
-            lowercase=True,
-            quotes=True,
-            spaces=True,
-            pollution=dict(
-                information=True,
-                bars=True,
-                biology=True,
-                doctors=True,
-                web=True,
-                coding=True,
-                footer=True,
-            ),
-        ),
-    )
-    ```
-
-!!! warning "Use qualifiers"
-    Those components **should be used with a qualification pipeline** to avoid extracted unwanted matches. At the very least, you can use available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developed and trained specifically for those components. For privacy reason, the model isn't publicly available yet.
-
-    !!! aphp "Use the ML model"
-
-        The model will soon be available in the models catalogue of AP-HP's CDW.
-
-!!! tip "On the medical definition of the comorbidities"
-
-    Those components were developped to extract **chronic** and **symptomatic** conditions only.
-
-## Aggregation
-
-For relevant phenotyping, matches should be aggregated at the document-level. For instance, a document might mention a complicated diabetes at the beginning ("*Le patient a une rétinopathie diabétique*"), and then refer to this diabetes without mentionning that it is complicated anymore ("*Concernant son diabète, le patient ...*").
-Thus, a good and simple aggregation rule is, for each comorbidity, to
-
-- disregard all entities tagged as irrelevant by the qualification component(s)
-- take the maximum (i.e., the most severe) status of the leftover entities
-
-An implementation of this rule is presented [here][aggregating-results]
+--8<-- "docs/pipes/ner/disorders/presentation.md"
diff --git a/docs/pipes/ner/disorders/presentation.md b/docs/pipes/ner/disorders/presentation.md
new file mode 100644
index 000000000..1918867a8
--- /dev/null
+++ b/docs/pipes/ner/disorders/presentation.md
@@ -0,0 +1,77 @@
+The components were developed by AP-HP's Data Science team with a team of medical experts, following the insights of the algorithm proposed by [@petitjean_2024].
+
+Some general considerations about those components:
+
+- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`.
+- The matched comorbidity is also available under the `ent.label_` of each match.
+- Matches have an associated `_.status` attribute taking the value `1` or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details.
+- Some components add additional information to matches. For instance, the `tobacco` component adds, if relevant, the extracted *pack-year* (= *paquet-année*). This information is available under the `ent._.assigned` attribute.
+- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline (see [Usage](#usage) below).
+
+--8<-- "docs/pipes/ner/disorders/warning.md"
+
+!!! warning "Use qualifiers"
+    Those components **should be used with a qualification pipeline** to avoid extracting unwanted matches. At the very least, you should use the available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developed and trained specifically for those components. For privacy reasons, the model isn't publicly available yet.
+
+    !!! aphp "Use the ML model"
+
+        For projects working on AP-HP's CDW, this model is available via its models catalogue.
+
+## Usage
+
+```{ .python .no-check }
+import edsnlp, edsnlp.pipes as eds
+
+nlp = edsnlp.blank("eds")
+nlp.add_pipe(eds.sentences())
+nlp.add_pipe(
+    eds.normalizer(
+        accents=True,
+        lowercase=True,
+        quotes=True,
+        spaces=True,
+        pollution=dict(
+            biology=True,  #(1)
+            coding=True,  #(2)
+        ),
+    ),
+)
+nlp.add_pipe(eds.tobacco())
+nlp.add_pipe(eds.diabetes())
+
+text = """
+Compte-rendu de consultation.
+
+Je vois ce jour M. SCOTT pour le suivi de sa rétinopathie diabétique.
+Le patient va bien depuis la dernière fois.
+Je le félicite pour la poursuite de son sevrage tabagique (toujours à 10 paquet-année).
+
+Sur le plan de son diabète, la glycémie est stable.
+"""
+
+doc = nlp(text)
+
+doc.spans
+# Out: {
+#     'pollutions': [],
+#     'tobacco': [sevrage tabagique (toujours à 10 paquet-année],
+#     'diabetes': [rétinopathie diabétique, diabète]
+# }
+
+tobacco_matches = doc.spans["tobacco"]
+tobacco_matches[0]._.detailed_status
+# Out: "ABSTINENCE" #
+
+tobacco_matches[0]._.assigned["PA"]  # paquet-année
+# Out: 10  # (3)
+
+
+diabetes = doc.spans["diabetes"]
+(diabetes[0]._.detailed_status, diabetes[1]._.detailed_status)
+# Out: ('WITH_COMPLICATION', 'WITHOUT_COMPLICATION')  # (4)
+```
+
+1. This will discard mentions of biology results, which often leads to false positives
+2. This will discard mentions of ICD10 coding that sometimes appears at the end of clinical documents
+3. Here we see an example of additional information that can be extracted
+4. Here we see the importance of document-level aggregation to extract the correct severity of each comorbidity.
diff --git a/docs/pipes/ner/disorders/warning.md b/docs/pipes/ner/disorders/warning.md
new file mode 100644
index 000000000..6268cf664
--- /dev/null
+++ b/docs/pipes/ner/disorders/warning.md
@@ -0,0 +1,7 @@
+!!! danger "On overlapping entities"
+    When using multiple disorders or behavior pipelines, some entities may be extracted by different pipes. For instance:
+
+    * "Intoxication éthylotabagique" will be tagged both by `eds.tobacco` and `eds.alcohol`
+    * "Cirrhose alcoolique" will be tagged both by `eds.liver_disease` and `eds.alcohol`
+
+    As `doc.ents` discards overlapping entities, you should use `doc.spans` instead.

From 51d5d714d02b01e36c05b9f4c2f12e2f232a707c Mon Sep 17 00:00:00 2001
From: Thomas Petit-Jean
Date: Wed, 16 Oct 2024 09:50:47 +0200
Subject: [PATCH 07/12] chore: update changelog

---
 changelog.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/changelog.md b/changelog.md
index c88ce1d74..1d1ae7146 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,19 @@
 # Changelog
 
+## Unreleased
+
+### Added
+
+- `EDSTokenizer` now handles `-\n` (found in text when splitting a long word with a linebreak) as a specific token, which can be discarded by the normalizer pipe.
+
+### Fixed
+
+- Use `ubuntu-22.04` instead of `ubuntu-latest` in CI to keep Python 3.7 compatibility
+- When using `ignore_space_tokens=True`, words separated only by linebreaks will be collected (via `get_text()`) with spaces in between
+- The `process` method of qualifier pipes now accepts a `Span` as input and treats it as a `Doc` to avoid alignment issues
+- The `detailed_status_mapping` of disorder/behavior pipes now handles the `KeyError: None` that could previously occur when loading pre-annotated docs without instantiating the pipes beforehand
+- Various fixes on the Alcohol and Tobacco pipes
+
 ## v0.13.1
 
 ### Added

From 336c46354b6ec0cce066832f14d1b8a935b4b9e5 Mon Sep 17 00:00:00 2001
From: Thomas Petit-Jean
Date: Wed, 16 Oct 2024 11:14:34 +0200
Subject: [PATCH 08/12] fix: update pattern for intraword linebreak

---
 edsnlp/language.py                                 | 2 +-
 edsnlp/pipes/core/normalizer/pollution/patterns.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/edsnlp/language.py b/edsnlp/language.py
index 8a6df7e9c..01ccae292 100644
--- a/edsnlp/language.py
+++ b/edsnlp/language.py
@@ -48,7 +48,7 @@ class EDSLanguage(French):
     r"Mme\.",
     r"Mlle\.",
     r"(?i:(?:ep\.))",
-    r"-\n",
+    r"(?
diff --git a/edsnlp/pipes/core/normalizer/pollution/patterns.py b/edsnlp/pipes/core/normalizer/pollution/patterns.py
index 915a81131..ab1631632 100644
--- a/edsnlp/pipes/core/normalizer/pollution/patterns.py
+++ b/edsnlp/pipes/core/normalizer/pollution/patterns.py
@@ -41,7 +41,7 @@
 # Word split in the middle due to line break
-intraword_split = r"-\n"
+intraword_split = r"(?
Date: Fri, 25 Oct 2024 08:45:05 +0000
Subject: [PATCH 09/12] various changes

---
 edsnlp/core/pipeline.py                                   | 8 +++++++-
 edsnlp/pipes/trainable/span_classifier/span_classifier.py | 6 ++++++
 edsnlp/train.py                                           | 6 ++++++
 edsnlp/utils/span_getters.py                              | 3 +++
 4 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/edsnlp/core/pipeline.py b/edsnlp/core/pipeline.py
index 0a932be70..3e0660419 100644
--- a/edsnlp/core/pipeline.py
+++ b/edsnlp/core/pipeline.py
@@ -761,7 +761,13 @@ def to_disk(
         if (
             os.path.exists(path)
             and os.listdir(path)
-            and not os.path.exists(path / "config.cfg")
+            and not (
+                os.path.exists(path / "config.cfg") or
+                (
+                    os.path.exists(path / "meta.json") and
+                    os.path.exists(path / "tokenizer")
+                )
+            )
         ):
             raise Exception(
                 "The directory already exists and doesn't appear to be a"
diff --git a/edsnlp/pipes/trainable/span_classifier/span_classifier.py b/edsnlp/pipes/trainable/span_classifier/span_classifier.py
index b7111cbe2..f30a82caf 100644
--- a/edsnlp/pipes/trainable/span_classifier/span_classifier.py
+++ b/edsnlp/pipes/trainable/span_classifier/span_classifier.py
@@ -474,11 +474,17 @@ def forward(self, batch: SpanClassifierBatchInput) -> BatchOutput:
         #   - `negated=False` and `negated=True`
         for group_idx, bindings_indexer in enumerate(self.bindings_indexers):
             if "targets" in batch:
+                # print("BATCH")
+                # print(span_embeds.shape)
+                # print(batch.keys())
+                # print(batch["targets"][:,].shape)
+                # print(batch["targets"][:,].sum().item())
                 losses.append(
                     F.cross_entropy(
                         binding_scores[:, bindings_indexer],
                         batch["targets"][:, group_idx],
                         reduction="sum",
+                        weight=torch.tensor([0.7,1.9]).to(binding_scores.device)
                     )
                 )
                 assert not torch.isnan(losses[-1]).any(), "NaN loss"
diff --git a/edsnlp/train.py b/edsnlp/train.py
index 8f0c01821..1f22220dc 100644
--- a/edsnlp/train.py
+++ b/edsnlp/train.py
@@ -588,6 +588,8 @@ def train(
     all_metrics = []
     nlp.train(True)
     set_seed(seed)
+
+    n_seen_samples = 0
 
     with RichTablePrinter(LOGGER_FIELDS, auto_refresh=False) as logger:
         with tqdm(
@@ -618,9 +620,13 @@ def train(
                 mini_batches = next(iterator)
                 optimizer.zero_grad()
                 for mini_batch in mini_batches:
+                    seen = False
                     loss = torch.zeros((), device=accelerator.device)
                     with nlp.cache():
                         for name, pipe in zip(pipe_names, trained_pipes):
+                            if not seen:
+                                n_seen_samples += mini_batch["ecci_qualifier"]["targets"].shape[0]
+
print(f"Step: {step} - Seen: {n_seen_samples} - Epoch: {1 + (n_seen_samples // 4464)}") output = pipe(mini_batch[name]) if "loss" in output: loss += output["loss"] diff --git a/edsnlp/utils/span_getters.py b/edsnlp/utils/span_getters.py index ce07acc61..dc5b8330f 100644 --- a/edsnlp/utils/span_getters.py +++ b/edsnlp/utils/span_getters.py @@ -42,11 +42,14 @@ def get_spans(doc, span_getter): if callable(span_getter): yield from span_getter(doc) return + seen = set() for key, span_filter in span_getter.items(): if key == "*": candidates = (span for group in doc.spans.values() for span in group) else: candidates = doc.spans.get(key, ()) if key != "ents" else doc.ents + candidates = [candidate for candidate in candidates if hash(candidate) not in seen] + seen |= set(hash(candidate) for candidate in candidates) if span_filter is True: yield from candidates else: From 464cd977539f393c827d6d8128c247261668f571 Mon Sep 17 00:00:00 2001 From: Thomas PETIT-JEAN Date: Fri, 25 Oct 2024 15:35:39 +0000 Subject: [PATCH 10/12] fix yielding last span + allow limited repeat in dataloader --- .../span_classifier/span_classifier.py | 10 +-- edsnlp/train.py | 63 +++++++++++++++++-- 2 files changed, 64 insertions(+), 9 deletions(-) diff --git a/edsnlp/pipes/trainable/span_classifier/span_classifier.py b/edsnlp/pipes/trainable/span_classifier/span_classifier.py index f30a82caf..0400534be 100644 --- a/edsnlp/pipes/trainable/span_classifier/span_classifier.py +++ b/edsnlp/pipes/trainable/span_classifier/span_classifier.py @@ -474,17 +474,17 @@ def forward(self, batch: SpanClassifierBatchInput) -> BatchOutput: # - `negated=False` and `negated=True` for group_idx, bindings_indexer in enumerate(self.bindings_indexers): if "targets" in batch: - # print("BATCH") + #print("BATCH") # print(span_embeds.shape) - # print(batch.keys()) - # print(batch["targets"][:,].shape) - # print(batch["targets"][:,].sum().item()) + #print(batch.keys()) + #print(batch["targets"][:,].shape) + #print(batch["targets"][:,].sum().item()) losses.append( F.cross_entropy( binding_scores[:, bindings_indexer], batch["targets"][:, group_idx], reduction="sum", - weight=torch.tensor([0.7,1.9]).to(binding_scores.device) + weight=torch.tensor([1.9, 0.7]).to(binding_scores.device) ) ) assert not torch.isnan(losses[-1]).any(), "NaN loss" diff --git a/edsnlp/train.py b/edsnlp/train.py index 1f22220dc..3e958320a 100644 --- a/edsnlp/train.py +++ b/edsnlp/train.py @@ -153,6 +153,7 @@ def __init__( noise=1, drop_last=True, buffer_size: Optional[int] = None, + repeat: Optional[int] = None, ): self.dataset = dataset self.batch_size = batch_size @@ -160,6 +161,7 @@ def __init__( self.noise = noise self.drop_last = drop_last self.buffer_size = buffer_size + self.repeat = repeat def __iter__(self): # Shuffle the dataset @@ -193,6 +195,7 @@ def sample_len(idx, noise=True): def make_batches(): total = 0 batch = [] + n_iter = 0 for seq_size, idx in sorted_sequences: if total and total + seq_size > self.batch_size: yield batch @@ -200,6 +203,8 @@ def make_batches(): batch = [] total += seq_size batch.append(idx) + if not self.drop_last: + yield batch # Shuffle the batches in buffer that contain approximately # the full dataset to add more randomness @@ -214,12 +219,12 @@ def make_batches(): # Sort sequences by length +- some noise sorted_sequences = chain.from_iterable( sorted((sample_len(i), i) for i in range(len(self.dataset))) - for _ in repeat(None) + for _ in repeat(None, times=self.repeat) ) # Batch sorted sequences batches = make_batches() - buffers = 
batchify(batches, buffer_size) + buffers = batchify(batches, buffer_size, drop_last=self.drop_last) for buffer in buffers: random.shuffle(buffer) yield from buffer @@ -243,6 +248,7 @@ def __init__(self, nlp, embedding, grad_accumulation_max_tokens): self.nlp = nlp self.embedding: Transformer = embedding self.grad_accumulation_max_tokens = grad_accumulation_max_tokens + self.i = 0 def __call__(self, seq): total = 0 @@ -483,6 +489,7 @@ def train( seed: int = 42, data_seed: int = 42, max_steps: int = 1000, + max_epochs: int | None = None, batch_size: BatchSizeArg = 2000, transformer_lr: float = 5e-5, task_lr: float = 3e-4, @@ -499,6 +506,10 @@ def train( for module_name, module in pipe.named_component_modules() if isinstance(module, Transformer) ) + assert not (max_steps and max_epochs), "Use only steps or epochs" + if max_epochs: + max_steps = int(0.9*(4464 / batch_size[0])) + set_seed(seed) # Loading and adapting the training and validation data @@ -531,7 +542,38 @@ def train( trf_pipe, grad_accumulation_max_tokens=grad_accumulation_max_tokens, ), + shuffle=False, ) + + true_steps = 0 + # for b in iter(dataloader): + # true_steps += b[0]["ecci_qualifier"]["targets"].shape[0] + # print(f"True: {true_steps} / Config: 4464") + # return + + batch_sampler=LengthSortedBatchSampler( + preprocessed, + batch_size=batch_size[0], + batch_unit=batch_size[1], + ), + #print("sampler", type(batch_sampler), len(batch_sampler), batch_sampler[0]) + for b in batch_sampler: + init_batches = sorted([len(data1["ecci_qualifier/targets"]) for data1 in b.dataset]) + + dataloader = torch.utils.data.DataLoader( + preprocessed, + batch_sampler=LengthSortedBatchSampler( + preprocessed, + batch_size=batch_size[0], + batch_unit=batch_size[1], + ), + collate_fn=SubBatchCollater( + nlp, + trf_pipe, + grad_accumulation_max_tokens=grad_accumulation_max_tokens, + ), + ) + pipe_names, trained_pipes = zip(*nlp.torch_components()) print("Training", ", ".join(pipe_names)) @@ -584,12 +626,14 @@ def train( cumulated_data = defaultdict(lambda: 0.0, count=0) - iterator = itertools.chain.from_iterable(itertools.repeat(dataloader)) + iterator = iter(dataloader) #itertools.chain.from_iterable(itertools.repeat(dataloader)) all_metrics = [] nlp.train(True) set_seed(seed) n_seen_samples = 0 + epoch = 0 + true_batches = [] with RichTablePrinter(LOGGER_FIELDS, auto_refresh=False) as logger: with tqdm( @@ -599,6 +643,10 @@ def train( mininterval=5.0, ) as bar: for step in bar: + #print("step ", step) + if epoch > max_epochs: + print(f"Done, left steps: {max_steps - step}") + # break if (step % validation_interval) == 0: scores = scorer(nlp, val_docs) all_metrics.append( @@ -616,17 +664,21 @@ def train( ) logger.log_metrics(flatten_dict(all_metrics[-1])) if step == max_steps: + print(f"Done, epoch {epoch}") break mini_batches = next(iterator) optimizer.zero_grad() for mini_batch in mini_batches: + # print("mini", mini_batch["ecci_qualifier"]["targets"].shape[0]) + true_batches.append(mini_batch["ecci_qualifier"]["targets"].shape[0]) seen = False loss = torch.zeros((), device=accelerator.device) with nlp.cache(): for name, pipe in zip(pipe_names, trained_pipes): if not seen: n_seen_samples += mini_batch["ecci_qualifier"]["targets"].shape[0] - print(f"Step: {step} - Seen: {n_seen_samples} - Epoch: {1 + (n_seen_samples // 4464)}") + epoch = 1 + (n_seen_samples / 4464) + #print(f"Step: {step} - Seen: {n_seen_samples} - Epoch: {epoch}") output = pipe(mini_batch[name]) if "loss" in output: loss += output["loss"] @@ -640,6 +692,9 @@ def train( 
torch.nn.utils.clip_grad_norm_(grad_params, max_grad_norm) optimizer.step() + + print(init_batches) + print(sorted(true_batches)) return nlp From 514157cc95f8e369482f44f751f2225e11eab294 Mon Sep 17 00:00:00 2001 From: Thomas Petit-Jean Date: Thu, 14 Nov 2024 16:12:18 +0100 Subject: [PATCH 11/12] continue --- .../span_classifier/span_classifier.py | 4 + edsnlp/train.py | 83 ++++++++++++++++++- 2 files changed, 83 insertions(+), 4 deletions(-) diff --git a/edsnlp/pipes/trainable/span_classifier/span_classifier.py b/edsnlp/pipes/trainable/span_classifier/span_classifier.py index 0400534be..61f8ce6e5 100644 --- a/edsnlp/pipes/trainable/span_classifier/span_classifier.py +++ b/edsnlp/pipes/trainable/span_classifier/span_classifier.py @@ -484,7 +484,11 @@ def forward(self, batch: SpanClassifierBatchInput) -> BatchOutput: binding_scores[:, bindings_indexer], batch["targets"][:, group_idx], reduction="sum", +<<<<<<< HEAD weight=torch.tensor([1.9, 0.7]).to(binding_scores.device) +======= + weight=torch.tensor([1.9, 0.7]).to(binding_scores.device), +>>>>>>> cc94186fc (continue) ) ) assert not torch.isnan(losses[-1]).any(), "NaN loss" diff --git a/edsnlp/train.py b/edsnlp/train.py index 3e958320a..fe48b24c3 100644 --- a/edsnlp/train.py +++ b/edsnlp/train.py @@ -1,4 +1,3 @@ -import itertools import json import math import random @@ -143,6 +142,9 @@ class LengthSortedBatchSampler: buffer_size: Optional[int] The size of the buffer to use to shuffle the batches. If None, the buffer will be approximately the size of the dataset. + repeat: Optional[int] + How many time will the sampler iterate over the dataset. If None, + iterates indefinitely. """ def __init__( @@ -151,7 +153,7 @@ def __init__( batch_size: int, batch_unit: str, noise=1, - drop_last=True, + drop_last=False, buffer_size: Optional[int] = None, repeat: Optional[int] = None, ): @@ -162,6 +164,12 @@ def __init__( self.drop_last = drop_last self.buffer_size = buffer_size self.repeat = repeat +<<<<<<< HEAD +======= + + def set_repeat(self, repeat): + self.repeat = repeat +>>>>>>> cc94186fc (continue) def __iter__(self): # Shuffle the dataset @@ -183,6 +191,7 @@ def sample_len(idx, noise=True): elif self.batch_unit == "spans": def sample_len(idx, noise=True): + # TODO: implement noise here ? 
return len( next( v for k, v in self.dataset[idx].items() if k.endswith("begins") @@ -251,6 +260,10 @@ def __init__(self, nlp, embedding, grad_accumulation_max_tokens): self.i = 0 def __call__(self, seq): + mini_batches = self.get_mini_batches(seq) + return [self.nlp.collate(b) for b in mini_batches] + + def get_mini_batches(self, seq): total = 0 mini_batches = [[]] for sample_features in seq: @@ -269,7 +282,7 @@ def __call__(self, seq): mini_batches.append([]) total += num_tokens mini_batches[-1].append(sample_features) - return [self.nlp.collate(b) for b in mini_batches] + return mini_batches def subset_doc(doc: Doc, start: int, end: int) -> Doc: @@ -508,8 +521,12 @@ def train( ) assert not (max_steps and max_epochs), "Use only steps or epochs" if max_epochs: +<<<<<<< HEAD max_steps = int(0.9*(4464 / batch_size[0])) +======= + max_steps = int(0.9 * (4464 / batch_size[0])) +>>>>>>> cc94186fc (continue) set_seed(seed) # Loading and adapting the training and validation data @@ -530,8 +547,33 @@ def train( show_progress=True ) ) + + batch_sampler = LengthSortedBatchSampler( + preprocessed, + batch_size=batch_size[0], + batch_unit=batch_size[1], + repeat=max_epochs, + ) + collate_fn = SubBatchCollater( + nlp, + trf_pipe, + grad_accumulation_max_tokens=grad_accumulation_max_tokens, + ) + + if max_epochs is not None: + # we have to make a dry run + batch_sampler.set_repeat(repeat=1) # single epoch + for batch in batch_sampler: + batch_collated = collate_fn.get_mini_batches(batch) + n_true_steps = len(batch_collated) + print(f"True number of steps: {n_true_steps}") + max_steps = max_epochs * n_true_steps + # TODO show mean batch size ? + batch_sampler.set_repeat(repeat=repeat) + dataloader = torch.utils.data.DataLoader( preprocessed, +<<<<<<< HEAD batch_sampler=LengthSortedBatchSampler( preprocessed, batch_size=batch_size[0], @@ -574,6 +616,13 @@ def train( ), ) +======= + batch_sampler=batch_sampler, + collate_fn=collate_fn, + shuffle=False, + ) + +>>>>>>> cc94186fc (continue) pipe_names, trained_pipes = zip(*nlp.torch_components()) print("Training", ", ".join(pipe_names)) @@ -626,11 +675,16 @@ def train( cumulated_data = defaultdict(lambda: 0.0, count=0) +<<<<<<< HEAD iterator = iter(dataloader) #itertools.chain.from_iterable(itertools.repeat(dataloader)) +======= + # TODO: maybe back to: itertools.chain.from_iterable(itertools.repeat(dataloader)) + iterator = iter(dataloader) +>>>>>>> cc94186fc (continue) all_metrics = [] nlp.train(True) set_seed(seed) - + n_seen_samples = 0 epoch = 0 true_batches = [] @@ -643,8 +697,13 @@ def train( mininterval=5.0, ) as bar: for step in bar: +<<<<<<< HEAD #print("step ", step) if epoch > max_epochs: +======= + # print("step ", step) + if max_epochs and (epoch > max_epochs): +>>>>>>> cc94186fc (continue) print(f"Done, left steps: {max_steps - step}") # break if (step % validation_interval) == 0: @@ -670,15 +729,29 @@ def train( optimizer.zero_grad() for mini_batch in mini_batches: # print("mini", mini_batch["ecci_qualifier"]["targets"].shape[0]) +<<<<<<< HEAD true_batches.append(mini_batch["ecci_qualifier"]["targets"].shape[0]) +======= + true_batches.append( + mini_batch["ecci_qualifier"]["targets"].shape[0] + ) +>>>>>>> cc94186fc (continue) seen = False loss = torch.zeros((), device=accelerator.device) with nlp.cache(): for name, pipe in zip(pipe_names, trained_pipes): if not seen: +<<<<<<< HEAD n_seen_samples += mini_batch["ecci_qualifier"]["targets"].shape[0] epoch = 1 + (n_seen_samples / 4464) #print(f"Step: {step} - Seen: {n_seen_samples} - Epoch: 
{epoch}") +======= + n_seen_samples += mini_batch["ecci_qualifier"][ + "targets" + ].shape[0] + epoch = 1 + (n_seen_samples / 4464) + # print(f"{step} - Seen:{n_seen_samples} - Epoch:{epoch}") +>>>>>>> cc94186fc (continue) output = pipe(mini_batch[name]) if "loss" in output: loss += output["loss"] @@ -696,6 +769,8 @@ def train( print(init_batches) print(sorted(true_batches)) + print(sorted(true_batches)) + return nlp From e08483d0dd9f20e78111602316670781a535f12d Mon Sep 17 00:00:00 2001 From: Thomas PETIT-JEAN Date: Mon, 18 Nov 2024 13:48:49 +0000 Subject: [PATCH 12/12] fix merge conflicts --- .../span_classifier/span_classifier.py | 4 - edsnlp/train.py | 73 +------------------ 2 files changed, 2 insertions(+), 75 deletions(-) diff --git a/edsnlp/pipes/trainable/span_classifier/span_classifier.py b/edsnlp/pipes/trainable/span_classifier/span_classifier.py index 61f8ce6e5..0973ed171 100644 --- a/edsnlp/pipes/trainable/span_classifier/span_classifier.py +++ b/edsnlp/pipes/trainable/span_classifier/span_classifier.py @@ -484,11 +484,7 @@ def forward(self, batch: SpanClassifierBatchInput) -> BatchOutput: binding_scores[:, bindings_indexer], batch["targets"][:, group_idx], reduction="sum", -<<<<<<< HEAD - weight=torch.tensor([1.9, 0.7]).to(binding_scores.device) -======= weight=torch.tensor([1.9, 0.7]).to(binding_scores.device), ->>>>>>> cc94186fc (continue) ) ) assert not torch.isnan(losses[-1]).any(), "NaN loss" diff --git a/edsnlp/train.py b/edsnlp/train.py index fe48b24c3..51c4ccecc 100644 --- a/edsnlp/train.py +++ b/edsnlp/train.py @@ -164,12 +164,9 @@ def __init__( self.drop_last = drop_last self.buffer_size = buffer_size self.repeat = repeat -<<<<<<< HEAD -======= def set_repeat(self, repeat): self.repeat = repeat ->>>>>>> cc94186fc (continue) def __iter__(self): # Shuffle the dataset @@ -521,13 +518,7 @@ def train( ) assert not (max_steps and max_epochs), "Use only steps or epochs" if max_epochs: -<<<<<<< HEAD - max_steps = int(0.9*(4464 / batch_size[0])) - -======= max_steps = int(0.9 * (4464 / batch_size[0])) ->>>>>>> cc94186fc (continue) - set_seed(seed) # Loading and adapting the training and validation data with set_seed(data_seed): @@ -570,10 +561,9 @@ def train( max_steps = max_epochs * n_true_steps # TODO show mean batch size ? 
batch_sampler.set_repeat(repeat=repeat) - + dataloader = torch.utils.data.DataLoader( preprocessed, -<<<<<<< HEAD batch_sampler=LengthSortedBatchSampler( preprocessed, batch_size=batch_size[0], @@ -587,42 +577,6 @@ def train( shuffle=False, ) - true_steps = 0 - # for b in iter(dataloader): - # true_steps += b[0]["ecci_qualifier"]["targets"].shape[0] - # print(f"True: {true_steps} / Config: 4464") - # return - - batch_sampler=LengthSortedBatchSampler( - preprocessed, - batch_size=batch_size[0], - batch_unit=batch_size[1], - ), - #print("sampler", type(batch_sampler), len(batch_sampler), batch_sampler[0]) - for b in batch_sampler: - init_batches = sorted([len(data1["ecci_qualifier/targets"]) for data1 in b.dataset]) - - dataloader = torch.utils.data.DataLoader( - preprocessed, - batch_sampler=LengthSortedBatchSampler( - preprocessed, - batch_size=batch_size[0], - batch_unit=batch_size[1], - ), - collate_fn=SubBatchCollater( - nlp, - trf_pipe, - grad_accumulation_max_tokens=grad_accumulation_max_tokens, - ), - ) - -======= - batch_sampler=batch_sampler, - collate_fn=collate_fn, - shuffle=False, - ) - ->>>>>>> cc94186fc (continue) pipe_names, trained_pipes = zip(*nlp.torch_components()) print("Training", ", ".join(pipe_names)) @@ -675,12 +629,8 @@ def train( cumulated_data = defaultdict(lambda: 0.0, count=0) -<<<<<<< HEAD - iterator = iter(dataloader) #itertools.chain.from_iterable(itertools.repeat(dataloader)) -======= # TODO: maybe back to: itertools.chain.from_iterable(itertools.repeat(dataloader)) iterator = iter(dataloader) ->>>>>>> cc94186fc (continue) all_metrics = [] nlp.train(True) set_seed(seed) @@ -697,15 +647,9 @@ def train( mininterval=5.0, ) as bar: for step in bar: -<<<<<<< HEAD - #print("step ", step) - if epoch > max_epochs: -======= - # print("step ", step) if max_epochs and (epoch > max_epochs): ->>>>>>> cc94186fc (continue) print(f"Done, left steps: {max_steps - step}") - # break + break if (step % validation_interval) == 0: scores = scorer(nlp, val_docs) all_metrics.append( @@ -728,30 +672,19 @@ def train( mini_batches = next(iterator) optimizer.zero_grad() for mini_batch in mini_batches: - # print("mini", mini_batch["ecci_qualifier"]["targets"].shape[0]) -<<<<<<< HEAD - true_batches.append(mini_batch["ecci_qualifier"]["targets"].shape[0]) -======= true_batches.append( mini_batch["ecci_qualifier"]["targets"].shape[0] ) ->>>>>>> cc94186fc (continue) seen = False loss = torch.zeros((), device=accelerator.device) with nlp.cache(): for name, pipe in zip(pipe_names, trained_pipes): if not seen: -<<<<<<< HEAD - n_seen_samples += mini_batch["ecci_qualifier"]["targets"].shape[0] - epoch = 1 + (n_seen_samples / 4464) - #print(f"Step: {step} - Seen: {n_seen_samples} - Epoch: {epoch}") -======= n_seen_samples += mini_batch["ecci_qualifier"][ "targets" ].shape[0] epoch = 1 + (n_seen_samples / 4464) # print(f"{step} - Seen:{n_seen_samples} - Epoch:{epoch}") ->>>>>>> cc94186fc (continue) output = pipe(mini_batch[name]) if "loss" in output: loss += output["loss"] @@ -769,8 +702,6 @@ def train( print(init_batches) print(sorted(true_batches)) - print(sorted(true_batches)) - return nlp