diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index 06aac1ac1..d182c0ae0 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -13,7 +13,7 @@ env:
 
 jobs:
   Documentation:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v2

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 65de04d5a..b6b9198d0 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -24,7 +24,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, windows-latest, macos-latest]
+        os: [ubuntu-22.04, windows-latest, macos-latest]
 
     steps:
       - uses: actions/checkout@v4
@@ -42,7 +42,7 @@ jobs:
 
   build_sdist:
     name: Build source distribution
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v2
@@ -58,7 +58,7 @@ jobs:
     name: Upload to PyPI
     needs: [build_wheels, build_sdist]
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/download-artifact@v4
@@ -76,7 +76,7 @@ jobs:
       # repository_url: https://test.pypi.org/legacy/
 
   Documentation:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v3

diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml
index 0b849728b..ac64a28b3 100644
--- a/.github/workflows/test-build.yml
+++ b/.github/workflows/test-build.yml
@@ -17,7 +17,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, windows-latest, macos-latest]
+        os: [ubuntu-22.04, windows-latest, macos-latest]
 
     steps:
       - uses: actions/checkout@v2
@@ -30,7 +30,7 @@ jobs:
 
   build_sdist:
     name: Build source distribution
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v2

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 728547434..f55139d32 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -15,7 +15,7 @@ jobs:
   linting:
     name: Linting
     if: github.event_name == 'pull_request'
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v3
         with:
@@ -32,7 +32,7 @@ jobs:
 
   pytest:
     name: Pytest
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     strategy:
       fail-fast: true
       matrix:
@@ -120,7 +120,7 @@ jobs:
 
   documentation:
     name: Documentation
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v2
@@ -150,7 +150,7 @@ jobs:
 
   simple-installation:
     name: Simple installation
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     strategy:
       fail-fast: true
       matrix:

diff --git a/changelog.md b/changelog.md
index c88ce1d74..1d1ae7146 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,19 @@
 # Changelog
 
+## Unreleased
+
+### Added
+
+- `EDS.Tokenizer` now handles `-\n` (found in text when splitting a long word with a linebreak) as a specific token, which can be discarded by the normalizer pipe.
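+
+  A minimal sketch of the new behavior (mirroring `test_eds_tokenizer_intraword_split` added at the bottom of this diff; outputs shown as comments):
+
+  ```python
+  import spacy
+
+  nlp = spacy.blank("eds")
+  # "dia-\nbete" is split into "dia", "-\n" and "bete", so the "-\n"
+  # token can later be excluded by the eds.normalizer pipe
+  [(w.text, w.whitespace_) for w in nlp("Un dia-\nbete ici")]
+  # [('Un', ' '), ('dia', ''), ('-\n', ''), ('bete', ' '), ('ici', '')]
+  ```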
+
+### Fixed
+
+- Use `ubuntu-22.04` instead of `ubuntu-latest` in CI to keep Python 3.7 compatibility
+- When using `ignore_space_tokens=True`, words separated only by linebreaks are now collected (via `get_text()`) with spaces in between
+- The `process` method of `Qualifiers` now accepts a `Span` as input, and treats it as a `Doc` to avoid alignment issues
+- The `detailed_status_mapping` of the disorder/behavior pipes now handles the `KeyError: None` that could occur when loading pre-annotated docs without instantiating the pipes beforehand
+- Various fixes to the Alcohol and Tobacco pipes
+
 ## v0.13.1
 
 ### Added

diff --git a/docs/pipes/ner/behaviors/alcohol.md b/docs/pipes/ner/behaviors/alcohol.md
index f8c5772fb..be987c188 100644
--- a/docs/pipes/ner/behaviors/alcohol.md
+++ b/docs/pipes/ner/behaviors/alcohol.md
@@ -1,5 +1,7 @@
 # Alcohol consumption {: #edsnlp.pipes.ner.behaviors.alcohol.factory.create_component }
 
+--8<-- "docs/pipes/ner/disorders/warning.md"
+
 ::: edsnlp.pipes.ner.behaviors.alcohol.factory.create_component
     options:
         heading_level: 2

diff --git a/docs/pipes/ner/behaviors/index.md b/docs/pipes/ner/behaviors/index.md
index 8544c0255..aac46ed71 100644
--- a/docs/pipes/ner/behaviors/index.md
+++ b/docs/pipes/ner/behaviors/index.md
@@ -2,99 +2,6 @@
 
 ## Presentation
 
-EDS-NLP offers two components to extract behavioral patterns, namely the tobacco and alcohol consumption status. Each component is based on the ContextualMatcher component.
-Some general considerations about those components:
+EDS-NLP offers two components to extract behavioral patterns, namely the tobacco and alcohol consumption status. Each component is based on the [ContextualMatcher][edsnlp.pipes.core.contextual_matcher.ContextualMatcher] matcher, available via the `eds.contextual_matcher` component.
 
-- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`.
-- The matched comorbidity is also available under the `ent.label_` of each match.
-- Matches have an associated `_.status` attribute taking the value `1`, or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details.
-- Some components add additional information to matches. For instance, the `tobacco` adds, if relevant, extracted *pack-year* (= *paquet-année*). Those information are available under the `ent._.assigned` attribute.
-- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline with the following parameters:
-  ```{ .python .no-check }
-  nlp.add_pipe(
-      eds.normalizer(
-          accents=True,
-          lowercase=True,
-          quotes=True,
-          spaces=True,
-          pollution=dict(
-              information=True,
-              bars=True,
-              biology=True,
-              doctors=True,
-              web=True,
-              coding=True,
-              footer=True,
-          ),
-      ),
-  )
-  ```
-
-!!! warning "Use qualifiers"
-    Those components **should be used with a qualification pipeline** to avoid extracted unwanted matches. At the very least, you can use available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developed and trained specifically for those components. For privacy reason, the model isn't publicly available yet.
-
-    !!! aphp "Use the ML model"
-
-        The model will soon be available in the models catalogue of AP-HP's CDW.
-
-## Usage
-
-```{ .python .no-check }
-import edsnlp, edsnlp.pipes as eds
-
-nlp = edsnlp.blank("eds")
-nlp.add_pipe(eds.sentences())
-nlp.add_pipe(
-    eds.normalizer(
-        accents=True,
-        lowercase=True,
-        quotes=True,
-        spaces=True,
-        pollution=dict(
-            information=True,
-            bars=True,
-            biology=True,
-            doctors=True,
-            web=True,
-            coding=True,
-            footer=True,
-        ),
-    ),
-)
-nlp.add_pipe(eds.tobacco())
-nlp.add_pipe(eds.diabetes())
-
-text = """
-Compte-rendu de consultation.
-
-Je vois ce jour M. SCOTT pour le suivi de sa rétinopathie diabétique.
-Le patient va bien depuis la dernière fois.
-Je le félicite pour la poursuite de son sevrage tabagique (toujours à 10 paquet-année).
-
-Sur le plan de son diabète, la glycémie est stable.
-"""
-
-doc = nlp(text)
-
-doc.spans
-# Out: {
-#     'pollutions': [],
-#     'tobacco': [sevrage tabagique (toujours à 10 paquet-année],
-#     'diabetes': [rétinopathie diabétique, diabète]
-# }
-
-tobacco_matches = doc.spans["tobacco"]
-tobacco_matches[0]._.detailed_status
-# Out: "ABSTINENCE" #
-
-tobacco_matches[0]._.assigned["PA"]  # paquet-année
-# Out: 10 # (1)
-
-
-diabetes = doc.spans["diabetes"]
-(diabetes[0]._.detailed_status, diabetes[1]._.detailed_status)
-# Out: ('WITH_COMPLICATION', 'WITHOUT_COMPLICATION') # (2)
-```
-
-1. Here we see an example of additional information that can be extracted
-2. Here we see the importance of document-level aggregation to extract the correct severity of each comorbidity.
+--8<-- "docs/pipes/ner/disorders/presentation.md"

diff --git a/docs/pipes/ner/disorders/index.md b/docs/pipes/ner/disorders/index.md
index e261fcd98..ad0321b0b 100644
--- a/docs/pipes/ner/disorders/index.md
+++ b/docs/pipes/ner/disorders/index.md
@@ -2,58 +2,6 @@
 
 ## Presentation
 
-The following components extract 16 different conditions from the [Charlson Comorbidity Index](https://www.rdplf.org/calculateurs/pages/charlson/charlson.html). Each component is based on the ContextualMatcher component.
+The following components extract 16 different conditions from the [Charlson Comorbidity Index](https://www.rdplf.org/calculateurs/pages/charlson/charlson.html). Each component is based on the [ContextualMatcher][edsnlp.pipes.core.contextual_matcher.ContextualMatcher] matcher, available via the `eds.contextual_matcher` component.
 
-The components were developed by AP-HP's Data Science team with a team of medical experts, following the insights of the algorithm proposed by [@petitjean_2024]
-
-Some general considerations about those components:
-
-- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`.
-- The matched comorbidity is also available under the `ent.label_` of each match.
-- Matches have an associated `_.status` attribute taking the value `1`, or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details.
-- Some components add additional information to matches. For instance, the `tobacco` adds, if relevant, extracted *pack-year* (= *paquet-année*). Those information are available under the `ent._.assigned` attribute.
-- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline with the following parameters:
-
-    ```{ .python .no-check }
-    import edsnlp, edsnlp.pipes as eds
-    ...
-
-    nlp.add_pipe(
-        eds.normalizer(
-            accents=True,
-            lowercase=True,
-            quotes=True,
-            spaces=True,
-            pollution=dict(
-                information=True,
-                bars=True,
-                biology=True,
-                doctors=True,
-                web=True,
-                coding=True,
-                footer=True,
-            ),
-        ),
-    )
-    ```
-
-!!! warning "Use qualifiers"
-    Those components **should be used with a qualification pipeline** to avoid extracted unwanted matches. At the very least, you can use available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developed and trained specifically for those components. For privacy reason, the model isn't publicly available yet.
-
-    !!! aphp "Use the ML model"
-
-        The model will soon be available in the models catalogue of AP-HP's CDW.
-
-!!! tip "On the medical definition of the comorbidities"
-
-    Those components were developped to extract **chronic** and **symptomatic** conditions only.
-
-## Aggregation
-
-For relevant phenotyping, matches should be aggregated at the document-level. For instance, a document might mention a complicated diabetes at the beginning ("*Le patient a une rétinopathie diabétique*"), and then refer to this diabetes without mentionning that it is complicated anymore ("*Concernant son diabète, le patient ...*").
-Thus, a good and simple aggregation rule is, for each comorbidity, to
-
-- disregard all entities tagged as irrelevant by the qualification component(s)
-- take the maximum (i.e., the most severe) status of the leftover entities
-
-An implementation of this rule is presented [here][aggregating-results]
+--8<-- "docs/pipes/ner/disorders/presentation.md"

diff --git a/docs/pipes/ner/disorders/presentation.md b/docs/pipes/ner/disorders/presentation.md
new file mode 100644
index 000000000..1918867a8
--- /dev/null
+++ b/docs/pipes/ner/disorders/presentation.md
@@ -0,0 +1,77 @@
+The components were developed by AP-HP's Data Science team with a team of medical experts, following the insights of the algorithm proposed by [@petitjean_2024].
+
+Some general considerations about those components:
+
+- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`.
+- The matched comorbidity is also available under the `ent.label_` of each match.
+- Matches have an associated `_.status` attribute taking the value `1` or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details.
+- Some components add additional information to matches. For instance, the `tobacco` component adds, if relevant, the extracted *pack-year* (= *paquet-année*) count. This information is available under the `ent._.assigned` attribute.
+- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline (see [Usage](#usage) below).
+
+--8<-- "docs/pipes/ner/disorders/warning.md"
+
+!!! warning "Use qualifiers"
+    Those components **should be used with a qualification pipeline** to avoid extracting unwanted matches. At the very least, you should use the available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better still, a machine-learning qualification component was developed and trained specifically for those components. For privacy reasons, the model isn't publicly available yet.
+
+    !!! aphp "Use the ML model"
+
+ +## Usage + +```{ .python .no-check } +import edsnlp, edsnlp.pipes as eds + +nlp = edsnlp.blank("eds") +nlp.add_pipe(eds.sentences()) +nlp.add_pipe( + eds.normalizer( + accents=True, + lowercase=True, + quotes=True, + spaces=True, + pollution=dict( + biology=True, #(1) + coding=True, #(2) + ), + ), +) +nlp.add_pipe(eds.tobacco()) +nlp.add_pipe(eds.diabetes()) + +text = """ +Compte-rendu de consultation. + +Je vois ce jour M. SCOTT pour le suivi de sa rétinopathie diabétique. +Le patient va bien depuis la dernière fois. +Je le félicite pour la poursuite de son sevrage tabagique (toujours à 10 paquet-année). + +Sur le plan de son diabète, la glycémie est stable. +""" + +doc = nlp(text) + +doc.spans +# Out: { +# 'pollutions': [], +# 'tobacco': [sevrage tabagique (toujours à 10 paquet-année], +# 'diabetes': [rétinopathie diabétique, diabète] +# } + +tobacco_matches = doc.spans["tobacco"] +tobacco_matches[0]._.detailed_status +# Out: "ABSTINENCE" # + +tobacco_matches[0]._.assigned["PA"] # paquet-année +# Out: 10 # (3) + + +diabetes = doc.spans["diabetes"] +(diabetes[0]._.detailed_status, diabetes[1]._.detailed_status) +# Out: ('WITH_COMPLICATION', 'WITHOUT_COMPLICATION') # (4) +``` + +1. This will discard mentions of biology results, which often leads to false positive +2. This will discard mentions of ICD10 coding that sometimes appears at the end of clinical documents +3. Here we see an example of additional information that can be extracted +4. Here we see the importance of document-level aggregation to extract the correct severity of each comorbidity. diff --git a/docs/pipes/ner/disorders/warning.md b/docs/pipes/ner/disorders/warning.md new file mode 100644 index 000000000..6268cf664 --- /dev/null +++ b/docs/pipes/ner/disorders/warning.md @@ -0,0 +1,7 @@ +!!! danger "On overlapping entities" + When using multiple disorders or behavior pipelines, some entities may be extracted from different pipes. For instance: + + * "Intoxication éthylotabagique" will be tagged both by `eds.tobacco` and `eds.alcohol` + * "Chirrose alcoolique" will be tagged both by `eds.liver_disease` and `eds.alcohol` + + As `doc.ents` discards overlapping entities, you should use `doc.spans` instead. diff --git a/edsnlp/core/pipeline.py b/edsnlp/core/pipeline.py index 0a932be70..3e0660419 100644 --- a/edsnlp/core/pipeline.py +++ b/edsnlp/core/pipeline.py @@ -761,7 +761,13 @@ def to_disk( if ( os.path.exists(path) and os.listdir(path) - and not os.path.exists(path / "config.cfg") + and not ( + os.path.exists(path / "config.cfg") or + ( + os.path.exists(path / "meta.json") and + os.path.exists(path / "tokenizer") + ) + ) ): raise Exception( "The directory already exists and doesn't appear to be a" diff --git a/edsnlp/language.py b/edsnlp/language.py index f61c804b3..01ccae292 100644 --- a/edsnlp/language.py +++ b/edsnlp/language.py @@ -41,7 +41,15 @@ class EDSLanguage(French): Defaults = EDSDefaults -TOKENIZER_EXCEPTIONS = [r"Dr\.", r"Pr\.", r"M\.", r"Mme\.", r"Mlle\.", r"(?i:(?:ep\.))"] +TOKENIZER_EXCEPTIONS = [ + r"Dr\.", + r"Pr\.", + r"M\.", + r"Mme\.", + r"Mlle\.", + r"(?i:(?:ep\.))", + r"(? 
List[Span]:
         stopped = self.negation.process(span)
 
         if not any(stopped_token.negation for stopped_token in stopped.tokens):
             span._.status = 2
-
-        if "zero_after" in span._.assigned.keys():
-            span._.negation = True
+        else:
+            if "zero_after" in span._.assigned.keys():
+                span._.negation = True
 
         yield span

diff --git a/edsnlp/pipes/ner/behaviors/alcohol/patterns.py b/edsnlp/pipes/ner/behaviors/alcohol/patterns.py
index 38c795926..7777225d3 100644
--- a/edsnlp/pipes/ner/behaviors/alcohol/patterns.py
+++ b/edsnlp/pipes/ner/behaviors/alcohol/patterns.py
@@ -3,7 +3,7 @@
     regex=[
         r"\balco[ol]",
         r"\bethyl",
-        r"(?
 Doc: annotated spaCy Doc object
         """
         spans = list(self.process(doc))
+        all_detailed_status = set(self.detailed_status_mapping.keys())
 
         for span in spans:
-            span._.detailed_status = self.detailed_status_mapping[span._.status]
+            if span._.status is not None and span._.status not in all_detailed_status:
+                raise ValueError(
+                    f"Got incorrect status value for '{span}'. Expected "
+                    f"None or one of {all_detailed_status}, got {span._.status}"
+                )
+            span._.detailed_status = self.detailed_status_mapping.get(
+                span._.status,
+                None,
+            )
 
         self.set_spans(doc, filter_spans(spans))

diff --git a/edsnlp/pipes/qualifiers/base.py b/edsnlp/pipes/qualifiers/base.py
index 521baf0e6..0b41fac9e 100644
--- a/edsnlp/pipes/qualifiers/base.py
+++ b/edsnlp/pipes/qualifiers/base.py
@@ -172,9 +172,13 @@ def get_matches(self, doc: Doc) -> List[Span]:
 
         return list(matches)
 
-    def process(self, doc: Doc) -> BaseQualifierResults:
+    def ensure_doc(self, doc: Union[Doc, Span]) -> Doc:
+        return doc if not hasattr(doc, "as_doc") else doc.as_doc()
+
+    def process(self, doc_like: Union[Doc, Span]) -> BaseQualifierResults:
+        doc_like = self.ensure_doc(doc_like)  # pragma: no cover
         raise NotImplementedError
 
     def __call__(self, doc: Doc) -> Doc:
-        results = self.process(doc)
+        results = self.process(doc)  # pragma: no cover
         raise NotImplementedError(f"{type(results)} should be used to tag the document")

diff --git a/edsnlp/pipes/qualifiers/family/family.py b/edsnlp/pipes/qualifiers/family/family.py
index e979e71de..12943d9eb 100644
--- a/edsnlp/pipes/qualifiers/family/family.py
+++ b/edsnlp/pipes/qualifiers/family/family.py
@@ -187,7 +187,8 @@ def set_extensions(self) -> None:
         if not Doc.has_extension("family"):
             Doc.set_extension("family", default=[])
 
-    def process(self, doc: Doc) -> FamilyResults:
+    def process(self, doc_like: Union[Doc, Span]) -> FamilyResults:
+        doc = self.ensure_doc(doc_like)
         matches = self.get_matches(doc)
 
         terminations = [m for m in matches if m.label_ == "termination"]

diff --git a/edsnlp/pipes/qualifiers/history/history.py b/edsnlp/pipes/qualifiers/history/history.py
index 2dc8d56d8..ab5469c95 100644
--- a/edsnlp/pipes/qualifiers/history/history.py
+++ b/edsnlp/pipes/qualifiers/history/history.py
@@ -326,7 +326,8 @@ def set_extensions(self) -> None:
             getter=deprecated_getter_factory("antecedent_cues", "history_cues"),
         )
 
-    def process(self, doc: Doc) -> HistoryResults:
+    def process(self, doc_like: Union[Doc, Span]) -> HistoryResults:
+        doc = self.ensure_doc(doc_like)
         note_datetime = None
         if doc._.note_datetime is not None:
             try:

diff --git a/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py b/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py
index 924d2cf63..68ed1d1d1 100644
--- a/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py
+++ b/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py
@@ -262,7 +262,8 @@ def load_verbs(
             list_hypo_verbs_following,
         )
 
-    def process(self, doc: Doc) -> HypothesisResults:
+    def process(self, doc_like: Union[Doc, Span]) -> HypothesisResults:
+        doc = self.ensure_doc(doc_like)
         matches = self.get_matches(doc)
 
         terminations = [m for m in matches if m.label_ == "termination"]

diff --git a/edsnlp/pipes/qualifiers/negation/negation.py b/edsnlp/pipes/qualifiers/negation/negation.py
index fb2c7878f..ea4a4fc40 100644
--- a/edsnlp/pipes/qualifiers/negation/negation.py
+++ b/edsnlp/pipes/qualifiers/negation/negation.py
@@ -295,7 +295,8 @@ def __call__(self, doc: Doc) -> Doc:
                     token._.negation = True
         return doc
 
-    def process(self, doc: Doc) -> NegationResults:
+    def process(self, doc_like: Union[Doc, Span]) -> NegationResults:
+        doc = self.ensure_doc(doc_like)
         matches = self.get_matches(doc)
 
         terminations = [m for m in matches if m.label_ == "termination"]

diff --git a/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py b/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py
index 77b0cbe91..759eb7091 100644
--- a/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py
+++ b/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py
@@ -226,7 +226,8 @@ def load_verbs(self, verbs: List[str]) -> List[str]:
 
         return list_rep_verbs
 
-    def process(self, doc: Doc) -> ReportedSpeechResults:
+    def process(self, doc_like: Union[Doc, Span]) -> ReportedSpeechResults:
+        doc = self.ensure_doc(doc_like)
         matches = self.get_matches(doc)
         matches += list(self.regex_matcher(doc, as_spans=True))

diff --git a/edsnlp/pipes/trainable/span_classifier/span_classifier.py b/edsnlp/pipes/trainable/span_classifier/span_classifier.py
index b7111cbe2..0973ed171 100644
--- a/edsnlp/pipes/trainable/span_classifier/span_classifier.py
+++ b/edsnlp/pipes/trainable/span_classifier/span_classifier.py
@@ -474,11 +474,17 @@ def forward(self, batch: SpanClassifierBatchInput) -> BatchOutput:
         #   - `negated=False` and `negated=True`
         for group_idx, bindings_indexer in enumerate(self.bindings_indexers):
             if "targets" in batch:
+                # print("BATCH")
+                # print(span_embeds.shape)
+                # print(batch.keys())
+                # print(batch["targets"][:,].shape)
+                # print(batch["targets"][:,].sum().item())
                 losses.append(
                     F.cross_entropy(
                         binding_scores[:, bindings_indexer],
                         batch["targets"][:, group_idx],
                         reduction="sum",
+                        weight=torch.tensor([1.9, 0.7]).to(binding_scores.device),
                     )
                 )
                 assert not torch.isnan(losses[-1]).any(), "NaN loss"

diff --git a/edsnlp/train.py b/edsnlp/train.py
index 8f0c01821..51c4ccecc 100644
--- a/edsnlp/train.py
+++ b/edsnlp/train.py
@@ -1,4 +1,3 @@
-import itertools
 import json
 import math
 import random
@@ -143,6 +142,9 @@ class LengthSortedBatchSampler:
     buffer_size: Optional[int]
         The size of the buffer to use to shuffle the batches. If None, the buffer
         will be approximately the size of the dataset.
+    repeat: Optional[int]
+        How many times the sampler will iterate over the dataset. If None, it
+        iterates indefinitely.
     """
 
     def __init__(
@@ -151,8 +153,9 @@ def __init__(
         batch_size: int,
         batch_unit: str,
         noise=1,
-        drop_last=True,
+        drop_last=False,
         buffer_size: Optional[int] = None,
+        repeat: Optional[int] = None,
     ):
         self.dataset = dataset
         self.batch_size = batch_size
@@ -160,6 +163,10 @@ def __init__(
         self.noise = noise
         self.drop_last = drop_last
         self.buffer_size = buffer_size
+        self.repeat = repeat
+
+    def set_repeat(self, repeat):
+        self.repeat = repeat
 
     def __iter__(self):
         # Shuffle the dataset
@@ -181,6 +188,7 @@ def sample_len(idx, noise=True):
         elif self.batch_unit == "spans":
 
             def sample_len(idx, noise=True):
+                # TODO: implement noise here?
                return len(
                     next(
                         v
                         for k, v in self.dataset[idx].items()
                         if k.endswith("begins")
                     )
                 )
@@ -193,6 +201,7 @@ def sample_len(idx, noise=True):
         def make_batches():
             total = 0
             batch = []
+            n_iter = 0
             for seq_size, idx in sorted_sequences:
                 if total and total + seq_size > self.batch_size:
                     yield batch
                     batch = []
                 total += seq_size
                 batch.append(idx)
+            if not self.drop_last:
+                yield batch
 
         # Shuffle the batches in buffer that contain approximately
         # the full dataset to add more randomness
@@ -214,12 +225,12 @@ def make_batches():
         # Sort sequences by length +- some noise
         sorted_sequences = chain.from_iterable(
             sorted((sample_len(i), i) for i in range(len(self.dataset)))
-            for _ in repeat(None)
+            for _ in repeat(None, times=self.repeat)
        )
 
         # Batch sorted sequences
         batches = make_batches()
-        buffers = batchify(batches, buffer_size)
+        buffers = batchify(batches, buffer_size, drop_last=self.drop_last)
         for buffer in buffers:
             random.shuffle(buffer)
             yield from buffer
@@ -243,8 +254,13 @@ def __init__(self, nlp, embedding, grad_accumulation_max_tokens):
         self.nlp = nlp
         self.embedding: Transformer = embedding
         self.grad_accumulation_max_tokens = grad_accumulation_max_tokens
+        self.i = 0
 
     def __call__(self, seq):
+        mini_batches = self.get_mini_batches(seq)
+        return [self.nlp.collate(b) for b in mini_batches]
+
+    def get_mini_batches(self, seq):
         total = 0
         mini_batches = [[]]
         for sample_features in seq:
@@ -263,7 +279,7 @@ def __call__(self, seq):
                 mini_batches.append([])
             total += num_tokens
             mini_batches[-1].append(sample_features)
-        return [self.nlp.collate(b) for b in mini_batches]
+        return mini_batches
 
 
 def subset_doc(doc: Doc, start: int, end: int) -> Doc:
@@ -483,6 +499,7 @@ def train(
     seed: int = 42,
     data_seed: int = 42,
     max_steps: int = 1000,
+    max_epochs: int | None = None,
     batch_size: BatchSizeArg = 2000,
     transformer_lr: float = 5e-5,
     task_lr: float = 3e-4,
@@ -499,7 +516,9 @@ def train(
         for module_name, module in pipe.named_component_modules()
         if isinstance(module, Transformer)
     )
-
+    assert not (max_steps and max_epochs), "Use only steps or epochs"
+    if max_epochs:
+        max_steps = int(0.9 * (4464 / batch_size[0]))
     set_seed(seed)
     # Loading and adapting the training and validation data
     with set_seed(data_seed):
@@ -519,6 +538,30 @@ def train(
                 show_progress=True
             )
         )
+
+    batch_sampler = LengthSortedBatchSampler(
+        preprocessed,
+        batch_size=batch_size[0],
+        batch_unit=batch_size[1],
+        repeat=max_epochs,
+    )
+    collate_fn = SubBatchCollater(
+        nlp,
+        trf_pipe,
+        grad_accumulation_max_tokens=grad_accumulation_max_tokens,
+    )
+
+    if max_epochs is not None:
+        # we have to make a dry run
+        batch_sampler.set_repeat(repeat=1)  # single epoch
+        for batch in batch_sampler:
+            batch_collated = collate_fn.get_mini_batches(batch)
+            n_true_steps = len(batch_collated)
+        print(f"True number of steps: {n_true_steps}")
+        max_steps = max_epochs * n_true_steps
+        # TODO: show mean batch size?
+        batch_sampler.set_repeat(repeat=repeat)
+
     dataloader = torch.utils.data.DataLoader(
         preprocessed,
         batch_sampler=LengthSortedBatchSampler(
@@ -531,7 +574,9 @@ def train(
             trf_pipe,
             grad_accumulation_max_tokens=grad_accumulation_max_tokens,
         ),
+        shuffle=False,
     )
+
     pipe_names, trained_pipes = zip(*nlp.torch_components())
     print("Training", ", ".join(pipe_names))
@@ -584,11 +629,16 @@ def train(
 
     cumulated_data = defaultdict(lambda: 0.0, count=0)
 
-    iterator = itertools.chain.from_iterable(itertools.repeat(dataloader))
+    # TODO: maybe back to: itertools.chain.from_iterable(itertools.repeat(dataloader))
+    iterator = iter(dataloader)
     all_metrics = []
     nlp.train(True)
     set_seed(seed)
 
+    n_seen_samples = 0
+    epoch = 0
+    true_batches = []
+
     with RichTablePrinter(LOGGER_FIELDS, auto_refresh=False) as logger:
         with tqdm(
             range(max_steps + 1),
@@ -597,6 +647,9 @@ def train(
             mininterval=5.0,
         ) as bar:
             for step in bar:
+                if max_epochs and (epoch > max_epochs):
+                    print(f"Done, left steps: {max_steps - step}")
+                    break
                 if (step % validation_interval) == 0:
                     scores = scorer(nlp, val_docs)
                     all_metrics.append(
@@ -614,13 +667,24 @@ def train(
                     )
                     logger.log_metrics(flatten_dict(all_metrics[-1]))
                 if step == max_steps:
+                    print(f"Done, epoch {epoch}")
                     break
                 mini_batches = next(iterator)
 
                 optimizer.zero_grad()
                 for mini_batch in mini_batches:
+                    true_batches.append(
+                        mini_batch["ecci_qualifier"]["targets"].shape[0]
+                    )
+                    seen = False
                     loss = torch.zeros((), device=accelerator.device)
                     with nlp.cache():
                         for name, pipe in zip(pipe_names, trained_pipes):
+                            if not seen:
+                                n_seen_samples += mini_batch["ecci_qualifier"][
+                                    "targets"
+                                ].shape[0]
+                                epoch = 1 + (n_seen_samples / 4464)
+                            # print(f"{step} - Seen:{n_seen_samples} - Epoch:{epoch}")
                             output = pipe(mini_batch[name])
                             if "loss" in output:
                                 loss += output["loss"]
@@ -634,6 +698,9 @@ def train(
 
             torch.nn.utils.clip_grad_norm_(grad_params, max_grad_norm)
             optimizer.step()
+
+    print(init_batches)
+    print(sorted(true_batches))
 
     return nlp

diff --git a/edsnlp/utils/doc_to_text.py b/edsnlp/utils/doc_to_text.py
index b9ea7043e..2a525e94a 100644
--- a/edsnlp/utils/doc_to_text.py
+++ b/edsnlp/utils/doc_to_text.py
@@ -82,8 +82,8 @@ def aggregate_tokens(
     else:
         keep_list = [True] * len(arr)
 
-    for i, (str_hash, space, keep) in enumerate(
-        zip(tokens_text, tokens_space, keep_list)
+    for i, (str_hash, tag_hash, space, keep) in enumerate(
+        zip(tokens_text, tokens_tag, tokens_space, keep_list)
     ):
         if keep:
             if space:
@@ -99,6 +99,12 @@ def aggregate_tokens(
                 offset += len(part)
                 ends[i] = offset
         else:
+            if i > 0 and tag_hash == space_hash:
+                if text_parts[i - 1][-1:] and (
+                    text_parts[i - 1][-1:] not in (" ", "\n")
+                ):
+                    text_parts[i - 1] += " "
+                    offset += 1
             begins[i] = offset
             ends[i] = offset

diff --git a/edsnlp/utils/span_getters.py b/edsnlp/utils/span_getters.py
index ce07acc61..dc5b8330f 100644
--- a/edsnlp/utils/span_getters.py
+++ b/edsnlp/utils/span_getters.py
@@ -42,11 +42,14 @@ def get_spans(doc, span_getter):
     if callable(span_getter):
         yield from span_getter(doc)
         return
+    seen = set()
     for key, span_filter in span_getter.items():
         if key == "*":
             candidates = (span for group in doc.spans.values() for span in group)
         else:
             candidates = doc.spans.get(key, ()) if key != "ents" else doc.ents
+        candidates = [candidate for candidate in candidates if hash(candidate) not in seen]
+        seen |= set(hash(candidate) for candidate in candidates)
         if span_filter is True:
             yield from candidates
         else:

diff --git a/tests/pipelines/core/test_normalisation.py b/tests/pipelines/core/test_normalisation.py
index 4cfa7b039..3628145bb 100644
--- a/tests/pipelines/core/test_normalisation.py
+++ b/tests/pipelines/core/test_normalisation.py
@@ -1,3 +1,4 @@
+import spacy
 from pytest import fixture
 
 from edsnlp.matchers.utils import get_text
@@ -25,7 +26,6 @@ def test_full_normalization(doc):
 @fixture
 def nlp_factory(blank_nlp):
     def f(a=False, lc=False, q=False, p=False):
-
         if a:
             a = dict(accents=accents)
         if q:
@@ -48,7 +48,6 @@ def f(a=False, lc=False, q=False, p=False):
 
 
 def test_normalization_accents(nlp_factory, text):
-
     nlp = nlp_factory(a=True)
     doc = nlp(text)
 
@@ -58,7 +57,6 @@ def test_normalization_accents(nlp_factory, text):
 
 
 def test_normalization_spaces(nlp_factory, text):
-
     nlp = nlp_factory(a=True)
     doc = nlp("Phrase avec des espaces \n et un retour à la ligne")
 
@@ -67,7 +65,6 @@ def test_normalization_spaces(nlp_factory, text):
 
 
 def test_normalization_quotes(nlp_factory, text):
-
     nlp = nlp_factory(q=True)
     doc = nlp(text)
 
@@ -79,7 +76,6 @@ def test_normalization_quotes(nlp_factory, text):
 
 
 def test_normalization_lowercase(nlp_factory, text):
-
     nlp = nlp_factory(lc=True)
     doc = nlp(text)
 
@@ -88,8 +84,24 @@ def test_normalization_lowercase(nlp_factory, text):
     assert norm.startswith("l'aïeul")
 
 
-def test_normalization_pollution(nlp_factory, text):
+def test_normalization_pollution_with_eds_lang():
+    nlp = spacy.blank("eds")
+    nlp.add_pipe("eds.normalizer")
+    text = "Il faut soigner ce diab-\nete"
+    doc = nlp(text)
+    norm = get_text(doc, attr="NORM", ignore_excluded=True)
+    assert norm == "il faut soigner ce diabete"
+
+
+def test_normalization_linebreak_no_space(nlp_factory):
+    nlp = nlp_factory()
+    text = "Mode de vie: \nTabac\nAlcool\nPas de sport"
+    doc = nlp(text)
+    norm = get_text(doc, attr="NORM", ignore_excluded=True, ignore_space_tokens=True)
+    assert norm == "Mode de vie: Tabac Alcool Pas de sport"
+
 
+def test_normalization_pollution(nlp_factory, text):
     nlp = nlp_factory(p=True)
     doc = nlp(text)

diff --git a/tests/pipelines/ner/disorders/alcohol.py b/tests/pipelines/ner/disorders/alcohol.py
index 19f261f5b..f9fb65234 100644
--- a/tests/pipelines/ner/disorders/alcohol.py
+++ b/tests/pipelines/ner/disorders/alcohol.py
@@ -9,6 +9,10 @@
         True,
         True,
         True,
+        True,
+        True,
+        True,
+        False,
     ],
     detailled_status=[
         None,
@@ -20,6 +24,10 @@
         None,
         "ABSTINENCE",
         None,
+        None,
+        "ABSTINENCE",
+        None,
+        None,
     ],
     negation=[
         None,
@@ -27,10 +35,14 @@
         None,
         None,
         None,
-        None,
+        False,
         True,
         None,
         True,
+        False,
+        False,
+        False,
+        None,
     ],
     assign=None,
     texts=[
@@ -38,10 +50,14 @@
         "OH chronique.",
         "Prise d'alcool occasionnelle",
         "Application d'un pansement alcoolisé",
-        "Alcoolisme sevré",
-        "Alcoolisme non sevré",
+        "Présence d'un alcoolisme sevré",
+        "Présence d'un alcoolisme non sevré",
         "Alcool: 0",
         "Le patient est en cours de sevrage éthylotabagique",
         "Patient alcoolique: non.",
+        "On a un alcoolique non sevré depuis 10 ans.",
+        "Alcoolisme sevré",
+        "Alcoolisme non sevré",
+        "Dosage vitamines 25-OH",
     ],
 )

diff --git a/tests/pipelines/ner/disorders/test_all.py b/tests/pipelines/ner/disorders/test_all.py
index 7eca71125..8f37bc6e9 100644
--- a/tests/pipelines/ner/disorders/test_all.py
+++ b/tests/pipelines/ner/disorders/test_all.py
@@ -99,7 +99,10 @@ def check(self):
         for ent in ents:
             assert ent.label_ == self.disorder
             if negation is not None:
-                assert ent._.negation == negation
+                if negation:
+                    assert ent._.negation == negation
+                else:
+                    assert ent._.negation is None
 
         if not ents:
             continue

diff --git a/tests/pipelines/ner/disorders/tobacco.py b/tests/pipelines/ner/disorders/tobacco.py
index 4ea3fa934..e46b000e9 100644
--- a/tests/pipelines/ner/disorders/tobacco.py
+++ b/tests/pipelines/ner/disorders/tobacco.py
@@ -9,6 +9,7 @@
         True,
         True,
         True,
+        True,
     ],
     detailled_status=[
         None,
@@ -20,6 +21,7 @@
         "ABSTINENCE",
         None,
         None,
+        None,
     ],
     negation=[
         None,
@@ -31,6 +33,7 @@
         None,
         True,
         True,
+        False,
     ],
     assign=[{"PA": 15}] + 8 * [None],
     texts=[
@@ -38,10 +41,11 @@
         "Patient tabagique",
         "Tabagisme festif",
         "On a un tabagisme ancien",
-        "Tabac: 0",
-        "Tabagisme passif",
-        "Tabac: sevré depuis 5 ans",
+        "Pour le tabac: 0",
+        "Notion de tabagisme passif",
+        "Concernant le tabac: sevré depuis 5 ans",
         "Le patient ne fume aucun truc.",
         "Le patient fume 0 PA.",
+        "On a un tabagique non sevré depuis 10 ans.",
     ],
 )

diff --git a/tests/test_language.py b/tests/test_language.py
index 369c6df12..363a7f117 100644
--- a/tests/test_language.py
+++ b/tests/test_language.py
@@ -58,6 +58,18 @@ def test_eds_tokenizer_whitespace():
     ]
 
 
+def test_eds_tokenizer_intraword_split():
+    nlp = spacy.blank("eds")
+    tokenized = [(w.text, w.whitespace_) for w in nlp("Un dia-\nbete ici")]
+    assert tokenized == [
+        ("Un", " "),
+        ("dia", ""),
+        ("-\n", ""),
+        ("bete", " "),
+        ("ici", ""),
+    ]
+
+
 def test_eds_tokenizer_numbers():
     nlp = spacy.blank("eds")
     tokenized = [(w.text, w.whitespace_) for w in nlp("Il fait 5.3/5.4mm")]
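For reference, a minimal sketch of the qualifier change above: `process` now accepts a `Span` and converts it to a standalone `Doc` via `ensure_doc`, so token offsets stay aligned. The pipeline and example text below are illustrative assumptions, not taken from this diff:

```python
import edsnlp
import edsnlp.pipes as eds

nlp = edsnlp.blank("eds")
nlp.add_pipe(eds.sentences())
negation = eds.negation()
nlp.add_pipe(negation)

doc = nlp("Le patient n'est pas diabétique.")
sentence = next(iter(doc.sents))  # a Span, not a Doc

# process() now calls ensure_doc() internally, turning the Span into
# its own Doc (Span.as_doc()) before matching, which avoids the
# alignment issues mentioned in the changelog
results = negation.process(sentence)
```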