From 808c392c0be4287f1fee1db21dc7128357042583 Mon Sep 17 00:00:00 2001 From: Thomas Petit-Jean Date: Wed, 16 Oct 2024 09:15:17 +0200 Subject: [PATCH 01/12] ci: use ubuntu-22 instead of latest to keep python37 compatibility --- .github/workflows/documentation.yml | 2 +- .github/workflows/release.yml | 8 ++++---- .github/workflows/test-build.yml | 4 ++-- .github/workflows/tests.yml | 8 ++++---- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 06aac1ac1..d182c0ae0 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -13,7 +13,7 @@ env: jobs: Documentation: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 65de04d5a..b6b9198d0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -24,7 +24,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-latest] + os: [ubuntu-22.04, windows-latest, macos-latest] steps: - uses: actions/checkout@v4 @@ -42,7 +42,7 @@ jobs: build_sdist: name: Build source distribution - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 @@ -58,7 +58,7 @@ jobs: name: Upload to PyPI needs: [build_wheels, build_sdist] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/download-artifact@v4 @@ -76,7 +76,7 @@ jobs: # repository_url: https://test.pypi.org/legacy/ Documentation: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 0b849728b..ac64a28b3 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -17,7 +17,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-latest] + os: [ubuntu-22.04, windows-latest, macos-latest] steps: - uses: actions/checkout@v2 @@ -30,7 +30,7 @@ jobs: build_sdist: name: Build source distribution - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 728547434..f55139d32 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -15,7 +15,7 @@ jobs: linting: name: Linting if: github.event_name == 'pull_request' - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 with: @@ -32,7 +32,7 @@ jobs: pytest: name: Pytest - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: fail-fast: true matrix: @@ -120,7 +120,7 @@ jobs: documentation: name: Documentation - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 @@ -150,7 +150,7 @@ jobs: simple-installation: name: Simple installation - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: fail-fast: true matrix: From 543d5df5dbf1c3d3b44da53ce1fcea4e9bc6a9ca Mon Sep 17 00:00:00 2001 From: Thomas Petit-Jean Date: Wed, 16 Oct 2024 09:16:45 +0200 Subject: [PATCH 02/12] feat: handle linebreak inside words and linebreak without leading whitespace --- edsnlp/language.py | 10 +++++++- .../core/normalizer/pollution/patterns.py | 5 ++++ edsnlp/utils/doc_to_text.py | 10 ++++++-- tests/pipelines/core/test_normalisation.py | 24 ++++++++++++++----- tests/test_language.py | 12 ++++++++++ 5 files changed, 52 insertions(+), 9 deletions(-) diff --git a/edsnlp/language.py 
b/edsnlp/language.py index f61c804b3..8a6df7e9c 100644 --- a/edsnlp/language.py +++ b/edsnlp/language.py @@ -41,7 +41,15 @@ class EDSLanguage(French): Defaults = EDSDefaults -TOKENIZER_EXCEPTIONS = [r"Dr\.", r"Pr\.", r"M\.", r"Mme\.", r"Mlle\.", r"(?i:(?:ep\.))"] +TOKENIZER_EXCEPTIONS = [ + r"Dr\.", + r"Pr\.", + r"M\.", + r"Mme\.", + r"Mlle\.", + r"(?i:(?:ep\.))", + r"-\n", +] class EDSTokenizer(Tokenizer): diff --git a/edsnlp/pipes/core/normalizer/pollution/patterns.py b/edsnlp/pipes/core/normalizer/pollution/patterns.py index ad5ae5100..915a81131 100644 --- a/edsnlp/pipes/core/normalizer/pollution/patterns.py +++ b/edsnlp/pipes/core/normalizer/pollution/patterns.py @@ -40,6 +40,9 @@ footer = rf"(?i)({page}.*\n?pat.*(ipp)?.*\n?(courrier valid.*)?)" footer += rf"|(.*{date}.*{ipp}.*)|(imprim.\sle\s{date}.*\d/\d.*\n?pat.*{date})" +# Word split in the middle due to line break +intraword_split = r"-\n" + pollution = dict( information=information, bars=bars, @@ -48,6 +51,7 @@ web=web, coding=coding, footer=footer, + intraword_split=intraword_split, ) default_enabled = dict( @@ -58,4 +62,5 @@ web=True, coding=False, footer=True, + intraword_split=True, ) diff --git a/edsnlp/utils/doc_to_text.py b/edsnlp/utils/doc_to_text.py index b9ea7043e..2a525e94a 100644 --- a/edsnlp/utils/doc_to_text.py +++ b/edsnlp/utils/doc_to_text.py @@ -82,8 +82,8 @@ def aggregate_tokens( else: keep_list = [True] * len(arr) - for i, (str_hash, space, keep) in enumerate( - zip(tokens_text, tokens_space, keep_list) + for i, (str_hash, tag_hash, space, keep) in enumerate( + zip(tokens_text, tokens_tag, tokens_space, keep_list) ): if keep: if space: @@ -99,6 +99,12 @@ def aggregate_tokens( offset += len(part) ends[i] = offset else: + if i > 0 and tag_hash == space_hash: + if text_parts[i - 1][-1:] and ( + text_parts[i - 1][-1:] not in (" ", "\n") + ): + text_parts[i - 1] += " " + offset += 1 begins[i] = offset ends[i] = offset diff --git a/tests/pipelines/core/test_normalisation.py b/tests/pipelines/core/test_normalisation.py index 4cfa7b039..3628145bb 100644 --- a/tests/pipelines/core/test_normalisation.py +++ b/tests/pipelines/core/test_normalisation.py @@ -1,3 +1,4 @@ +import spacy from pytest import fixture from edsnlp.matchers.utils import get_text @@ -25,7 +26,6 @@ def test_full_normalization(doc): @fixture def nlp_factory(blank_nlp): def f(a=False, lc=False, q=False, p=False): - if a: a = dict(accents=accents) if q: @@ -48,7 +48,6 @@ def f(a=False, lc=False, q=False, p=False): def test_normalization_accents(nlp_factory, text): - nlp = nlp_factory(a=True) doc = nlp(text) @@ -58,7 +57,6 @@ def test_normalization_accents(nlp_factory, text): def test_normalization_spaces(nlp_factory, text): - nlp = nlp_factory(a=True) doc = nlp("Phrase avec des espaces \n et un retour à la ligne") @@ -67,7 +65,6 @@ def test_normalization_spaces(nlp_factory, text): def test_normalization_quotes(nlp_factory, text): - nlp = nlp_factory(q=True) doc = nlp(text) @@ -79,7 +76,6 @@ def test_normalization_quotes(nlp_factory, text): def test_normalization_lowercase(nlp_factory, text): - nlp = nlp_factory(lc=True) doc = nlp(text) @@ -88,8 +84,24 @@ def test_normalization_lowercase(nlp_factory, text): assert norm.startswith("l'aïeul") -def test_normalization_pollution(nlp_factory, text): +def test_normalization_pollution_with_eds_lang(): + nlp = spacy.blank("eds") + nlp.add_pipe("eds.normalizer") + text = "Il faut soigner ce diab-\nete" + doc = nlp(text) + norm = get_text(doc, attr="NORM", ignore_excluded=True) + assert norm == "il faut soigner ce 
diabete" + + +def test_normalization_linebreak_no_space(nlp_factory): + nlp = nlp_factory() + text = "Mode de vie: \nTabac\nAlcool\nPas de sport" + doc = nlp(text) + norm = get_text(doc, attr="NORM", ignore_excluded=True, ignore_space_tokens=True) + assert norm == "Mode de vie: Tabac Alcool Pas de sport" + +def test_normalization_pollution(nlp_factory, text): nlp = nlp_factory(p=True) doc = nlp(text) diff --git a/tests/test_language.py b/tests/test_language.py index 369c6df12..363a7f117 100644 --- a/tests/test_language.py +++ b/tests/test_language.py @@ -58,6 +58,18 @@ def test_eds_tokenizer_whitespace(): ] +def test_eds_tokenizer_intraword_split(): + nlp = spacy.blank("eds") + tokenized = [(w.text, w.whitespace_) for w in nlp("Un dia-\nbete ici")] + assert tokenized == [ + ("Un", " "), + ("dia", ""), + ("-\n", ""), + ("bete", " "), + ("ici", ""), + ] + + def test_eds_tokenizer_numbers(): nlp = spacy.blank("eds") tokenized = [(w.text, w.whitespace_) for w in nlp("Il fait 5.3/5.4mm")] From 698a8dfe7bd113f30d39434bd73a4641117f1a33 Mon Sep 17 00:00:00 2001 From: Thomas Petit-Jean Date: Wed, 16 Oct 2024 09:17:19 +0200 Subject: [PATCH 03/12] fix: handle case where status is None in behavior/disorder pipes --- edsnlp/pipes/ner/disorders/base.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/edsnlp/pipes/ner/disorders/base.py b/edsnlp/pipes/ner/disorders/base.py index 06d25358d..54855dbea 100644 --- a/edsnlp/pipes/ner/disorders/base.py +++ b/edsnlp/pipes/ner/disorders/base.py @@ -113,8 +113,17 @@ def __call__(self, doc: Doc) -> Doc: annotated spaCy Doc object """ spans = list(self.process(doc)) + all_detailed_status = set(self.detailed_status_mapping.keys()) for span in spans: - span._.detailed_status = self.detailed_status_mapping[span._.status] + if span._.status is not None and span._.status not in all_detailed_status: + raise ValueError( + f"Got incorrect status value for '{span}'. 
Expected " + f"None or one of {all_detailed_status}, got {span._.status}" + ) + span._.detailed_status = self.detailed_status_mapping.get( + span._.status, + None, + ) self.set_spans(doc, filter_spans(spans)) From 827f4bef31de9f9e8660f55fd6c1700b3cb5a833 Mon Sep 17 00:00:00 2001 From: Thomas Petit-Jean Date: Wed, 16 Oct 2024 09:18:17 +0200 Subject: [PATCH 04/12] fix: treat span as doc in Qualifier process method --- edsnlp/pipes/qualifiers/base.py | 8 ++++++-- edsnlp/pipes/qualifiers/family/family.py | 3 ++- edsnlp/pipes/qualifiers/history/history.py | 3 ++- edsnlp/pipes/qualifiers/hypothesis/hypothesis.py | 3 ++- edsnlp/pipes/qualifiers/negation/negation.py | 3 ++- .../pipes/qualifiers/reported_speech/reported_speech.py | 3 ++- 6 files changed, 16 insertions(+), 7 deletions(-) diff --git a/edsnlp/pipes/qualifiers/base.py b/edsnlp/pipes/qualifiers/base.py index 521baf0e6..0b41fac9e 100644 --- a/edsnlp/pipes/qualifiers/base.py +++ b/edsnlp/pipes/qualifiers/base.py @@ -172,9 +172,13 @@ def get_matches(self, doc: Doc) -> List[Span]: return list(matches) - def process(self, doc: Doc) -> BaseQualifierResults: + def ensure_doc(self, doc: Union[Doc, Span]) -> Doc: + return doc if not hasattr(doc, "as_doc") else doc.as_doc() + + def process(self, doc_like: Union[Doc, Span]) -> BaseQualifierResults: + doc_like = self.ensure_doc(doc_like) # pragma: no cover raise NotImplementedError def __call__(self, doc: Doc) -> Doc: - results = self.process(doc) + results = self.process(doc) # pragma: no cover raise NotImplementedError(f"{type(results)} should be used to tag the document") diff --git a/edsnlp/pipes/qualifiers/family/family.py b/edsnlp/pipes/qualifiers/family/family.py index e979e71de..12943d9eb 100644 --- a/edsnlp/pipes/qualifiers/family/family.py +++ b/edsnlp/pipes/qualifiers/family/family.py @@ -187,7 +187,8 @@ def set_extensions(self) -> None: if not Doc.has_extension("family"): Doc.set_extension("family", default=[]) - def process(self, doc: Doc) -> FamilyResults: + def process(self, doc_like: Union[Doc, Span]) -> FamilyResults: + doc = self.ensure_doc(doc_like) matches = self.get_matches(doc) terminations = [m for m in matches if m.label_ == "termination"] diff --git a/edsnlp/pipes/qualifiers/history/history.py b/edsnlp/pipes/qualifiers/history/history.py index 2dc8d56d8..ab5469c95 100644 --- a/edsnlp/pipes/qualifiers/history/history.py +++ b/edsnlp/pipes/qualifiers/history/history.py @@ -326,7 +326,8 @@ def set_extensions(self) -> None: getter=deprecated_getter_factory("antecedent_cues", "history_cues"), ) - def process(self, doc: Doc) -> HistoryResults: + def process(self, doc_like: Union[Doc, Span]) -> HistoryResults: + doc = self.ensure_doc(doc_like) note_datetime = None if doc._.note_datetime is not None: try: diff --git a/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py b/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py index 924d2cf63..68ed1d1d1 100644 --- a/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py +++ b/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py @@ -262,7 +262,8 @@ def load_verbs( list_hypo_verbs_following, ) - def process(self, doc: Doc) -> HypothesisResults: + def process(self, doc_like: Union[Doc, Span]) -> HypothesisResults: + doc = self.ensure_doc(doc_like) matches = self.get_matches(doc) terminations = [m for m in matches if m.label_ == "termination"] diff --git a/edsnlp/pipes/qualifiers/negation/negation.py b/edsnlp/pipes/qualifiers/negation/negation.py index fb2c7878f..ea4a4fc40 100644 --- a/edsnlp/pipes/qualifiers/negation/negation.py +++ 
b/edsnlp/pipes/qualifiers/negation/negation.py @@ -295,7 +295,8 @@ def __call__(self, doc: Doc) -> Doc: token._.negation = True return doc - def process(self, doc: Doc) -> NegationResults: + def process(self, doc_like: Union[Doc, Span]) -> NegationResults: + doc = self.ensure_doc(doc_like) matches = self.get_matches(doc) terminations = [m for m in matches if m.label_ == "termination"] diff --git a/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py b/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py index 77b0cbe91..759eb7091 100644 --- a/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py +++ b/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py @@ -226,7 +226,8 @@ def load_verbs(self, verbs: List[str]) -> List[str]: return list_rep_verbs - def process(self, doc: Doc) -> ReportedSpeechResults: + def process(self, doc_like: Union[Doc, Span]) -> ReportedSpeechResults: + doc = self.ensure_doc(doc_like) matches = self.get_matches(doc) matches += list(self.regex_matcher(doc, as_spans=True)) From 5d790d21ccbd236eee0f0653a0d633fd045ccc43 Mon Sep 17 00:00:00 2001 From: Thomas Petit-Jean Date: Wed, 16 Oct 2024 09:19:10 +0200 Subject: [PATCH 05/12] fix: small bug in alcohol and tobacco pipes --- edsnlp/pipes/ner/behaviors/alcohol/alcohol.py | 6 ++--- .../pipes/ner/behaviors/alcohol/patterns.py | 2 +- tests/pipelines/ner/disorders/alcohol.py | 22 ++++++++++++++++--- tests/pipelines/ner/disorders/test_all.py | 5 ++++- tests/pipelines/ner/disorders/tobacco.py | 10 ++++++--- 5 files changed, 34 insertions(+), 11 deletions(-) diff --git a/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py b/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py index ca543fa6e..c1bbc9177 100644 --- a/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py +++ b/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py @@ -118,8 +118,8 @@ def process(self, doc: Doc) -> List[Span]: stopped = self.negation.process(span) if not any(stopped_token.negation for stopped_token in stopped.tokens): span._.status = 2 - - if "zero_after" in span._.assigned.keys(): - span._.negation = True + else: + if "zero_after" in span._.assigned.keys(): + span._.negation = True yield span diff --git a/edsnlp/pipes/ner/behaviors/alcohol/patterns.py b/edsnlp/pipes/ner/behaviors/alcohol/patterns.py index 38c795926..7777225d3 100644 --- a/edsnlp/pipes/ner/behaviors/alcohol/patterns.py +++ b/edsnlp/pipes/ner/behaviors/alcohol/patterns.py @@ -3,7 +3,7 @@ regex=[ r"\balco[ol]", r"\bethyl", - r"(? 
Date: Wed, 16 Oct 2024 09:46:56 +0200 Subject: [PATCH 06/12] docs: add details for disorders and behavior pipes --- docs/pipes/ner/behaviors/alcohol.md | 2 + docs/pipes/ner/behaviors/index.md | 97 +----------------------- docs/pipes/ner/disorders/index.md | 56 +------------- docs/pipes/ner/disorders/presentation.md | 77 +++++++++++++++++++ docs/pipes/ner/disorders/warning.md | 7 ++ 5 files changed, 90 insertions(+), 149 deletions(-) create mode 100644 docs/pipes/ner/disorders/presentation.md create mode 100644 docs/pipes/ner/disorders/warning.md diff --git a/docs/pipes/ner/behaviors/alcohol.md b/docs/pipes/ner/behaviors/alcohol.md index f8c5772fb..be987c188 100644 --- a/docs/pipes/ner/behaviors/alcohol.md +++ b/docs/pipes/ner/behaviors/alcohol.md @@ -1,5 +1,7 @@ # Alcohol consumption {: #edsnlp.pipes.ner.behaviors.alcohol.factory.create_component } +--8<-- "docs/pipes/ner/disorders/warning.md" + ::: edsnlp.pipes.ner.behaviors.alcohol.factory.create_component options: heading_level: 2 diff --git a/docs/pipes/ner/behaviors/index.md b/docs/pipes/ner/behaviors/index.md index 8544c0255..aac46ed71 100644 --- a/docs/pipes/ner/behaviors/index.md +++ b/docs/pipes/ner/behaviors/index.md @@ -2,99 +2,6 @@ ## Presentation -EDS-NLP offers two components to extract behavioral patterns, namely the tobacco and alcohol consumption status. Each component is based on the ContextualMatcher component. -Some general considerations about those components: +EDS-NLP offers two components to extract behavioral patterns, namely the tobacco and alcohol consumption status. Each component is based on the [ContextualMatcher][edsnlp.pipes.core.contextual_matcher.ContextualMatcher] matcher, itself based on `eds.contextual_matcher` component. -- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`. -- The matched comorbidity is also available under the `ent.label_` of each match. -- Matches have an associated `_.status` attribute taking the value `1`, or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details. -- Some components add additional information to matches. For instance, the `tobacco` adds, if relevant, extracted *pack-year* (= *paquet-année*). Those information are available under the `ent._.assigned` attribute. -- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline with the following parameters: - ```{ .python .no-check } - nlp.add_pipe( - eds.normalizer( - accents=True, - lowercase=True, - quotes=True, - spaces=True, - pollution=dict( - information=True, - bars=True, - biology=True, - doctors=True, - web=True, - coding=True, - footer=True, - ), - ), - ) - ``` - -!!! warning "Use qualifiers" - Those components **should be used with a qualification pipeline** to avoid extracted unwanted matches. At the very least, you can use available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developed and trained specifically for those components. For privacy reason, the model isn't publicly available yet. - - !!! aphp "Use the ML model" - - The model will soon be available in the models catalogue of AP-HP's CDW. 
- -## Usage - -```{ .python .no-check } -import edsnlp, edsnlp.pipes as eds - -nlp = edsnlp.blank("eds") -nlp.add_pipe(eds.sentences()) -nlp.add_pipe( - eds.normalizer( - accents=True, - lowercase=True, - quotes=True, - spaces=True, - pollution=dict( - information=True, - bars=True, - biology=True, - doctors=True, - web=True, - coding=True, - footer=True, - ), - ), -) -nlp.add_pipe(eds.tobacco()) -nlp.add_pipe(eds.diabetes()) - -text = """ -Compte-rendu de consultation. - -Je vois ce jour M. SCOTT pour le suivi de sa rétinopathie diabétique. -Le patient va bien depuis la dernière fois. -Je le félicite pour la poursuite de son sevrage tabagique (toujours à 10 paquet-année). - -Sur le plan de son diabète, la glycémie est stable. -""" - -doc = nlp(text) - -doc.spans -# Out: { -# 'pollutions': [], -# 'tobacco': [sevrage tabagique (toujours à 10 paquet-année], -# 'diabetes': [rétinopathie diabétique, diabète] -# } - -tobacco_matches = doc.spans["tobacco"] -tobacco_matches[0]._.detailed_status -# Out: "ABSTINENCE" # - -tobacco_matches[0]._.assigned["PA"] # paquet-année -# Out: 10 # (1) - - -diabetes = doc.spans["diabetes"] -(diabetes[0]._.detailed_status, diabetes[1]._.detailed_status) -# Out: ('WITH_COMPLICATION', 'WITHOUT_COMPLICATION') # (2) -``` - -1. Here we see an example of additional information that can be extracted -2. Here we see the importance of document-level aggregation to extract the correct severity of each comorbidity. +--8<-- "docs/pipes/ner/disorders/presentation.md" diff --git a/docs/pipes/ner/disorders/index.md b/docs/pipes/ner/disorders/index.md index e261fcd98..ad0321b0b 100644 --- a/docs/pipes/ner/disorders/index.md +++ b/docs/pipes/ner/disorders/index.md @@ -2,58 +2,6 @@ ## Presentation -The following components extract 16 different conditions from the [Charlson Comorbidity Index](https://www.rdplf.org/calculateurs/pages/charlson/charlson.html). Each component is based on the ContextualMatcher component. +The following components extract 16 different conditions from the [Charlson Comorbidity Index](https://www.rdplf.org/calculateurs/pages/charlson/charlson.html). Each component is based on the [ContextualMatcher][edsnlp.pipes.core.contextual_matcher.ContextualMatcher] matcher, itself based on `eds.contextual_matcher` component. -The components were developed by AP-HP's Data Science team with a team of medical experts, following the insights of the algorithm proposed by [@petitjean_2024] - -Some general considerations about those components: - -- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`. -- The matched comorbidity is also available under the `ent.label_` of each match. -- Matches have an associated `_.status` attribute taking the value `1`, or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details. -- Some components add additional information to matches. For instance, the `tobacco` adds, if relevant, extracted *pack-year* (= *paquet-année*). Those information are available under the `ent._.assigned` attribute. -- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline with the following parameters: - - ```{ .python .no-check } - import edsnlp, edsnlp.pipes as eds - ... 
-
-    nlp.add_pipe(
-        eds.normalizer(
-            accents=True,
-            lowercase=True,
-            quotes=True,
-            spaces=True,
-            pollution=dict(
-                information=True,
-                bars=True,
-                biology=True,
-                doctors=True,
-                web=True,
-                coding=True,
-                footer=True,
-            ),
-        ),
-    )
-    ```
-
-!!! warning "Use qualifiers"
-    Those components **should be used with a qualification pipeline** to avoid extracted unwanted matches. At the very least, you can use available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developed and trained specifically for those components. For privacy reason, the model isn't publicly available yet.
-
-    !!! aphp "Use the ML model"
-
-        The model will soon be available in the models catalogue of AP-HP's CDW.
-
-!!! tip "On the medical definition of the comorbidities"
-
-    Those components were developped to extract **chronic** and **symptomatic** conditions only.
-
-## Aggregation
-
-For relevant phenotyping, matches should be aggregated at the document-level. For instance, a document might mention a complicated diabetes at the beginning ("*Le patient a une rétinopathie diabétique*"), and then refer to this diabetes without mentionning that it is complicated anymore ("*Concernant son diabète, le patient ...*").
-Thus, a good and simple aggregation rule is, for each comorbidity, to
-
-- disregard all entities tagged as irrelevant by the qualification component(s)
-- take the maximum (i.e., the most severe) status of the leftover entities
-
-An implementation of this rule is presented [here][aggregating-results]
+--8<-- "docs/pipes/ner/disorders/presentation.md"
diff --git a/docs/pipes/ner/disorders/presentation.md b/docs/pipes/ner/disorders/presentation.md
new file mode 100644
index 000000000..1918867a8
--- /dev/null
+++ b/docs/pipes/ner/disorders/presentation.md
@@ -0,0 +1,77 @@
+The components were developed by AP-HP's Data Science team with a team of medical experts, following the insights of the algorithm proposed by [@petitjean_2024].
+
+Some general considerations about those components:
+
+- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`.
+- The matched comorbidity is also available under the `ent.label_` of each match.
+- Matches have an associated `_.status` attribute taking the value `1` or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details.
+- Some components add additional information to matches. For instance, the `tobacco` component adds, if relevant, the extracted *pack-year* (= *paquet-année*). This information is available under the `ent._.assigned` attribute.
+- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline (see [Usage](#usage) below).
+
+--8<-- "docs/pipes/ner/disorders/warning.md"
+
+!!! warning "Use qualifiers"
+    Those components **should be used with a qualification pipeline** to avoid extracting unwanted matches. At the very least, you should use the available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developed and trained specifically for those components. For privacy reasons, the model isn't publicly available yet.
+
+    !!! aphp "Use the ML model"
+
+        For projects working on AP-HP's CDW, this model is available via its models catalogue.
+
+## Usage
+
+```{ .python .no-check }
+import edsnlp, edsnlp.pipes as eds
+
+nlp = edsnlp.blank("eds")
+nlp.add_pipe(eds.sentences())
+nlp.add_pipe(
+    eds.normalizer(
+        accents=True,
+        lowercase=True,
+        quotes=True,
+        spaces=True,
+        pollution=dict(
+            biology=True,  #(1)
+            coding=True,  #(2)
+        ),
+    ),
+)
+nlp.add_pipe(eds.tobacco())
+nlp.add_pipe(eds.diabetes())
+
+text = """
+Compte-rendu de consultation.
+
+Je vois ce jour M. SCOTT pour le suivi de sa rétinopathie diabétique.
+Le patient va bien depuis la dernière fois.
+Je le félicite pour la poursuite de son sevrage tabagique (toujours à 10 paquet-année).
+
+Sur le plan de son diabète, la glycémie est stable.
+"""
+
+doc = nlp(text)
+
+doc.spans
+# Out: {
+#     'pollutions': [],
+#     'tobacco': [sevrage tabagique (toujours à 10 paquet-année],
+#     'diabetes': [rétinopathie diabétique, diabète]
+# }
+
+tobacco_matches = doc.spans["tobacco"]
+tobacco_matches[0]._.detailed_status
+# Out: "ABSTINENCE" #
+
+tobacco_matches[0]._.assigned["PA"]  # paquet-année
+# Out: 10  # (3)
+
+
+diabetes = doc.spans["diabetes"]
+(diabetes[0]._.detailed_status, diabetes[1]._.detailed_status)
+# Out: ('WITH_COMPLICATION', 'WITHOUT_COMPLICATION')  # (4)
+```
+
+1. This will discard mentions of biology results, which often leads to false positives
+2. This will discard mentions of ICD10 coding that sometimes appears at the end of clinical documents
+3. Here we see an example of additional information that can be extracted
+4. Here we see the importance of document-level aggregation to extract the correct severity of each comorbidity.
diff --git a/docs/pipes/ner/disorders/warning.md b/docs/pipes/ner/disorders/warning.md
new file mode 100644
index 000000000..6268cf664
--- /dev/null
+++ b/docs/pipes/ner/disorders/warning.md
@@ -0,0 +1,7 @@
+!!! danger "On overlapping entities"
+    When using multiple disorders or behavior pipelines, some entities may be extracted by different pipes. For instance:
+
+    * "Intoxication éthylotabagique" will be tagged both by `eds.tobacco` and `eds.alcohol`
+    * "Cirrhose alcoolique" will be tagged both by `eds.liver_disease` and `eds.alcohol`
+
+    As `doc.ents` discards overlapping entities, you should use `doc.spans` instead.

From 51d5d714d02b01e36c05b9f4c2f12e2f232a707c Mon Sep 17 00:00:00 2001
From: Thomas Petit-Jean
Date: Wed, 16 Oct 2024 09:50:47 +0200
Subject: [PATCH 07/12] chore: update changelog

---
 changelog.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/changelog.md b/changelog.md
index c88ce1d74..1d1ae7146 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,19 @@
 # Changelog
 
+## Unreleased
+
+### Added
+
+- `EDSTokenizer` now handles `-\n` (found in text when splitting a long word with a linebreak) as a specific token, which can be discarded by the normalizer pipe.
+
+### Fixed
+
+- Use `ubuntu-22.04` instead of `ubuntu-latest` in CI to keep Python 3.7 compatibility
+- When using `ignore_space_tokens=True`, words separated only by linebreaks will be collected (via `get_text()`) with spaces in between
+- The `process` method of qualifier pipes now accepts a `Span` as input and treats it as a `Doc` to avoid alignment issues
+- The `detailed_status_mapping` of disorder/behavior pipes now handles the `KeyError: None` that could previously occur when loading pre-annotated docs without instantiating the pipes beforehand
+- Various fixes on the Alcohol and Tobacco pipes
+
 ## v0.13.1
 
 ### Added

From 336c46354b6ec0cce066832f14d1b8a935b4b9e5 Mon Sep 17 00:00:00 2001
From: Thomas Petit-Jean
Date: Wed, 16 Oct 2024 11:14:34 +0200
Subject: [PATCH 08/12] fix: update pattern for intraword linebreak

---
 edsnlp/language.py                                 | 2 +-
 edsnlp/pipes/core/normalizer/pollution/patterns.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/edsnlp/language.py b/edsnlp/language.py
index 8a6df7e9c..01ccae292 100644
--- a/edsnlp/language.py
+++ b/edsnlp/language.py
@@ -48,7 +48,7 @@ class EDSLanguage(French):
     r"Mme\.",
     r"Mlle\.",
     r"(?i:(?:ep\.))",
-    r"-\n",
+    r"(?
diff --git a/edsnlp/pipes/core/normalizer/pollution/patterns.py b/edsnlp/pipes/core/normalizer/pollution/patterns.py
index 915a81131..ab1631632 100644
--- a/edsnlp/pipes/core/normalizer/pollution/patterns.py
+++ b/edsnlp/pipes/core/normalizer/pollution/patterns.py
@@ -41,7 +41,7 @@
 # Word split in the middle due to line break
-intraword_split = r"-\n"
+intraword_split = r"(?
Date: Fri, 25 Oct 2024 08:45:05 +0000
Subject: [PATCH 09/12] various changes

---
 edsnlp/core/pipeline.py                                   | 8 +++++++-
 edsnlp/pipes/trainable/span_classifier/span_classifier.py | 6 ++++++
 edsnlp/train.py                                           | 6 ++++++
 edsnlp/utils/span_getters.py                              | 3 +++
 4 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/edsnlp/core/pipeline.py b/edsnlp/core/pipeline.py
index 0a932be70..3e0660419 100644
--- a/edsnlp/core/pipeline.py
+++ b/edsnlp/core/pipeline.py
@@ -761,7 +761,13 @@ def to_disk(
         if (
             os.path.exists(path)
             and os.listdir(path)
-            and not os.path.exists(path / "config.cfg")
+            and not (
+                os.path.exists(path / "config.cfg") or
+                (
+                    os.path.exists(path / "meta.json") and
+                    os.path.exists(path / "tokenizer")
+                )
+            )
         ):
             raise Exception(
                 "The directory already exists and doesn't appear to be a"
diff --git a/edsnlp/pipes/trainable/span_classifier/span_classifier.py b/edsnlp/pipes/trainable/span_classifier/span_classifier.py
index b7111cbe2..f30a82caf 100644
--- a/edsnlp/pipes/trainable/span_classifier/span_classifier.py
+++ b/edsnlp/pipes/trainable/span_classifier/span_classifier.py
@@ -474,11 +474,17 @@ def forward(self, batch: SpanClassifierBatchInput) -> BatchOutput:
         #   - `negated=False` and `negated=True`
         for group_idx, bindings_indexer in enumerate(self.bindings_indexers):
             if "targets" in batch:
+                # print("BATCH")
+                # print(span_embeds.shape)
+                # print(batch.keys())
+                # print(batch["targets"][:,].shape)
+                # print(batch["targets"][:,].sum().item())
                 losses.append(
                     F.cross_entropy(
                         binding_scores[:, bindings_indexer],
                         batch["targets"][:, group_idx],
                         reduction="sum",
+                        weight=torch.tensor([0.7,1.9]).to(binding_scores.device)
                     )
                 )
                 assert not torch.isnan(losses[-1]).any(), "NaN loss"
diff --git a/edsnlp/train.py b/edsnlp/train.py
index 8f0c01821..1f22220dc 100644
--- a/edsnlp/train.py
+++ b/edsnlp/train.py
@@ -588,6 +588,8 @@ def train(
     all_metrics = []
     nlp.train(True)
     set_seed(seed)
+
+    n_seen_samples = 0
 
     with RichTablePrinter(LOGGER_FIELDS, auto_refresh=False) as logger:
         with tqdm(
@@ -618,9 +620,13 @@ def train(
                 mini_batches = next(iterator)
                 optimizer.zero_grad()
                 for mini_batch in mini_batches:
+                    seen = False
                     loss = torch.zeros((), device=accelerator.device)
                     with nlp.cache():
                         for name, pipe in zip(pipe_names, trained_pipes):
+                            if not seen:
+                                n_seen_samples += mini_batch["ecci_qualifier"]["targets"].shape[0]
+
print(f"Step: {step} - Seen: {n_seen_samples} - Epoch: {1 + (n_seen_samples // 4464)}") output = pipe(mini_batch[name]) if "loss" in output: loss += output["loss"] diff --git a/edsnlp/utils/span_getters.py b/edsnlp/utils/span_getters.py index ce07acc61..dc5b8330f 100644 --- a/edsnlp/utils/span_getters.py +++ b/edsnlp/utils/span_getters.py @@ -42,11 +42,14 @@ def get_spans(doc, span_getter): if callable(span_getter): yield from span_getter(doc) return + seen = set() for key, span_filter in span_getter.items(): if key == "*": candidates = (span for group in doc.spans.values() for span in group) else: candidates = doc.spans.get(key, ()) if key != "ents" else doc.ents + candidates = [candidate for candidate in candidates if hash(candidate) not in seen] + seen |= set(hash(candidate) for candidate in candidates) if span_filter is True: yield from candidates else: From 464cd977539f393c827d6d8128c247261668f571 Mon Sep 17 00:00:00 2001 From: Thomas PETIT-JEAN Date: Fri, 25 Oct 2024 15:35:39 +0000 Subject: [PATCH 10/12] fix yielding last span + allow limited repeat in dataloader --- .../span_classifier/span_classifier.py | 10 +-- edsnlp/train.py | 63 +++++++++++++++++-- 2 files changed, 64 insertions(+), 9 deletions(-) diff --git a/edsnlp/pipes/trainable/span_classifier/span_classifier.py b/edsnlp/pipes/trainable/span_classifier/span_classifier.py index f30a82caf..0400534be 100644 --- a/edsnlp/pipes/trainable/span_classifier/span_classifier.py +++ b/edsnlp/pipes/trainable/span_classifier/span_classifier.py @@ -474,17 +474,17 @@ def forward(self, batch: SpanClassifierBatchInput) -> BatchOutput: # - `negated=False` and `negated=True` for group_idx, bindings_indexer in enumerate(self.bindings_indexers): if "targets" in batch: - # print("BATCH") + #print("BATCH") # print(span_embeds.shape) - # print(batch.keys()) - # print(batch["targets"][:,].shape) - # print(batch["targets"][:,].sum().item()) + #print(batch.keys()) + #print(batch["targets"][:,].shape) + #print(batch["targets"][:,].sum().item()) losses.append( F.cross_entropy( binding_scores[:, bindings_indexer], batch["targets"][:, group_idx], reduction="sum", - weight=torch.tensor([0.7,1.9]).to(binding_scores.device) + weight=torch.tensor([1.9, 0.7]).to(binding_scores.device) ) ) assert not torch.isnan(losses[-1]).any(), "NaN loss" diff --git a/edsnlp/train.py b/edsnlp/train.py index 1f22220dc..3e958320a 100644 --- a/edsnlp/train.py +++ b/edsnlp/train.py @@ -153,6 +153,7 @@ def __init__( noise=1, drop_last=True, buffer_size: Optional[int] = None, + repeat: Optional[int] = None, ): self.dataset = dataset self.batch_size = batch_size @@ -160,6 +161,7 @@ def __init__( self.noise = noise self.drop_last = drop_last self.buffer_size = buffer_size + self.repeat = repeat def __iter__(self): # Shuffle the dataset @@ -193,6 +195,7 @@ def sample_len(idx, noise=True): def make_batches(): total = 0 batch = [] + n_iter = 0 for seq_size, idx in sorted_sequences: if total and total + seq_size > self.batch_size: yield batch @@ -200,6 +203,8 @@ def make_batches(): batch = [] total += seq_size batch.append(idx) + if not self.drop_last: + yield batch # Shuffle the batches in buffer that contain approximately # the full dataset to add more randomness @@ -214,12 +219,12 @@ def make_batches(): # Sort sequences by length +- some noise sorted_sequences = chain.from_iterable( sorted((sample_len(i), i) for i in range(len(self.dataset))) - for _ in repeat(None) + for _ in repeat(None, times=self.repeat) ) # Batch sorted sequences batches = make_batches() - buffers = 
batchify(batches, buffer_size) + buffers = batchify(batches, buffer_size, drop_last=self.drop_last) for buffer in buffers: random.shuffle(buffer) yield from buffer @@ -243,6 +248,7 @@ def __init__(self, nlp, embedding, grad_accumulation_max_tokens): self.nlp = nlp self.embedding: Transformer = embedding self.grad_accumulation_max_tokens = grad_accumulation_max_tokens + self.i = 0 def __call__(self, seq): total = 0 @@ -483,6 +489,7 @@ def train( seed: int = 42, data_seed: int = 42, max_steps: int = 1000, + max_epochs: int | None = None, batch_size: BatchSizeArg = 2000, transformer_lr: float = 5e-5, task_lr: float = 3e-4, @@ -499,6 +506,10 @@ def train( for module_name, module in pipe.named_component_modules() if isinstance(module, Transformer) ) + assert not (max_steps and max_epochs), "Use only steps or epochs" + if max_epochs: + max_steps = int(0.9*(4464 / batch_size[0])) + set_seed(seed) # Loading and adapting the training and validation data @@ -531,7 +542,38 @@ def train( trf_pipe, grad_accumulation_max_tokens=grad_accumulation_max_tokens, ), + shuffle=False, ) + + true_steps = 0 + # for b in iter(dataloader): + # true_steps += b[0]["ecci_qualifier"]["targets"].shape[0] + # print(f"True: {true_steps} / Config: 4464") + # return + + batch_sampler=LengthSortedBatchSampler( + preprocessed, + batch_size=batch_size[0], + batch_unit=batch_size[1], + ), + #print("sampler", type(batch_sampler), len(batch_sampler), batch_sampler[0]) + for b in batch_sampler: + init_batches = sorted([len(data1["ecci_qualifier/targets"]) for data1 in b.dataset]) + + dataloader = torch.utils.data.DataLoader( + preprocessed, + batch_sampler=LengthSortedBatchSampler( + preprocessed, + batch_size=batch_size[0], + batch_unit=batch_size[1], + ), + collate_fn=SubBatchCollater( + nlp, + trf_pipe, + grad_accumulation_max_tokens=grad_accumulation_max_tokens, + ), + ) + pipe_names, trained_pipes = zip(*nlp.torch_components()) print("Training", ", ".join(pipe_names)) @@ -584,12 +626,14 @@ def train( cumulated_data = defaultdict(lambda: 0.0, count=0) - iterator = itertools.chain.from_iterable(itertools.repeat(dataloader)) + iterator = iter(dataloader) #itertools.chain.from_iterable(itertools.repeat(dataloader)) all_metrics = [] nlp.train(True) set_seed(seed) n_seen_samples = 0 + epoch = 0 + true_batches = [] with RichTablePrinter(LOGGER_FIELDS, auto_refresh=False) as logger: with tqdm( @@ -599,6 +643,10 @@ def train( mininterval=5.0, ) as bar: for step in bar: + #print("step ", step) + if epoch > max_epochs: + print(f"Done, left steps: {max_steps - step}") + # break if (step % validation_interval) == 0: scores = scorer(nlp, val_docs) all_metrics.append( @@ -616,17 +664,21 @@ def train( ) logger.log_metrics(flatten_dict(all_metrics[-1])) if step == max_steps: + print(f"Done, epoch {epoch}") break mini_batches = next(iterator) optimizer.zero_grad() for mini_batch in mini_batches: + # print("mini", mini_batch["ecci_qualifier"]["targets"].shape[0]) + true_batches.append(mini_batch["ecci_qualifier"]["targets"].shape[0]) seen = False loss = torch.zeros((), device=accelerator.device) with nlp.cache(): for name, pipe in zip(pipe_names, trained_pipes): if not seen: n_seen_samples += mini_batch["ecci_qualifier"]["targets"].shape[0] - print(f"Step: {step} - Seen: {n_seen_samples} - Epoch: {1 + (n_seen_samples // 4464)}") + epoch = 1 + (n_seen_samples / 4464) + #print(f"Step: {step} - Seen: {n_seen_samples} - Epoch: {epoch}") output = pipe(mini_batch[name]) if "loss" in output: loss += output["loss"] @@ -640,6 +692,9 @@ def train( 
torch.nn.utils.clip_grad_norm_(grad_params, max_grad_norm) optimizer.step() + + print(init_batches) + print(sorted(true_batches)) return nlp From 514157cc95f8e369482f44f751f2225e11eab294 Mon Sep 17 00:00:00 2001 From: Thomas Petit-Jean Date: Thu, 14 Nov 2024 16:12:18 +0100 Subject: [PATCH 11/12] continue --- .../span_classifier/span_classifier.py | 4 + edsnlp/train.py | 83 ++++++++++++++++++- 2 files changed, 83 insertions(+), 4 deletions(-) diff --git a/edsnlp/pipes/trainable/span_classifier/span_classifier.py b/edsnlp/pipes/trainable/span_classifier/span_classifier.py index 0400534be..61f8ce6e5 100644 --- a/edsnlp/pipes/trainable/span_classifier/span_classifier.py +++ b/edsnlp/pipes/trainable/span_classifier/span_classifier.py @@ -484,7 +484,11 @@ def forward(self, batch: SpanClassifierBatchInput) -> BatchOutput: binding_scores[:, bindings_indexer], batch["targets"][:, group_idx], reduction="sum", +<<<<<<< HEAD weight=torch.tensor([1.9, 0.7]).to(binding_scores.device) +======= + weight=torch.tensor([1.9, 0.7]).to(binding_scores.device), +>>>>>>> cc94186fc (continue) ) ) assert not torch.isnan(losses[-1]).any(), "NaN loss" diff --git a/edsnlp/train.py b/edsnlp/train.py index 3e958320a..fe48b24c3 100644 --- a/edsnlp/train.py +++ b/edsnlp/train.py @@ -1,4 +1,3 @@ -import itertools import json import math import random @@ -143,6 +142,9 @@ class LengthSortedBatchSampler: buffer_size: Optional[int] The size of the buffer to use to shuffle the batches. If None, the buffer will be approximately the size of the dataset. + repeat: Optional[int] + How many time will the sampler iterate over the dataset. If None, + iterates indefinitely. """ def __init__( @@ -151,7 +153,7 @@ def __init__( batch_size: int, batch_unit: str, noise=1, - drop_last=True, + drop_last=False, buffer_size: Optional[int] = None, repeat: Optional[int] = None, ): @@ -162,6 +164,12 @@ def __init__( self.drop_last = drop_last self.buffer_size = buffer_size self.repeat = repeat +<<<<<<< HEAD +======= + + def set_repeat(self, repeat): + self.repeat = repeat +>>>>>>> cc94186fc (continue) def __iter__(self): # Shuffle the dataset @@ -183,6 +191,7 @@ def sample_len(idx, noise=True): elif self.batch_unit == "spans": def sample_len(idx, noise=True): + # TODO: implement noise here ? 
return len( next( v for k, v in self.dataset[idx].items() if k.endswith("begins") @@ -251,6 +260,10 @@ def __init__(self, nlp, embedding, grad_accumulation_max_tokens): self.i = 0 def __call__(self, seq): + mini_batches = self.get_mini_batches(seq) + return [self.nlp.collate(b) for b in mini_batches] + + def get_mini_batches(self, seq): total = 0 mini_batches = [[]] for sample_features in seq: @@ -269,7 +282,7 @@ def __call__(self, seq): mini_batches.append([]) total += num_tokens mini_batches[-1].append(sample_features) - return [self.nlp.collate(b) for b in mini_batches] + return mini_batches def subset_doc(doc: Doc, start: int, end: int) -> Doc: @@ -508,8 +521,12 @@ def train( ) assert not (max_steps and max_epochs), "Use only steps or epochs" if max_epochs: +<<<<<<< HEAD max_steps = int(0.9*(4464 / batch_size[0])) +======= + max_steps = int(0.9 * (4464 / batch_size[0])) +>>>>>>> cc94186fc (continue) set_seed(seed) # Loading and adapting the training and validation data @@ -530,8 +547,33 @@ def train( show_progress=True ) ) + + batch_sampler = LengthSortedBatchSampler( + preprocessed, + batch_size=batch_size[0], + batch_unit=batch_size[1], + repeat=max_epochs, + ) + collate_fn = SubBatchCollater( + nlp, + trf_pipe, + grad_accumulation_max_tokens=grad_accumulation_max_tokens, + ) + + if max_epochs is not None: + # we have to make a dry run + batch_sampler.set_repeat(repeat=1) # single epoch + for batch in batch_sampler: + batch_collated = collate_fn.get_mini_batches(batch) + n_true_steps = len(batch_collated) + print(f"True number of steps: {n_true_steps}") + max_steps = max_epochs * n_true_steps + # TODO show mean batch size ? + batch_sampler.set_repeat(repeat=repeat) + dataloader = torch.utils.data.DataLoader( preprocessed, +<<<<<<< HEAD batch_sampler=LengthSortedBatchSampler( preprocessed, batch_size=batch_size[0], @@ -574,6 +616,13 @@ def train( ), ) +======= + batch_sampler=batch_sampler, + collate_fn=collate_fn, + shuffle=False, + ) + +>>>>>>> cc94186fc (continue) pipe_names, trained_pipes = zip(*nlp.torch_components()) print("Training", ", ".join(pipe_names)) @@ -626,11 +675,16 @@ def train( cumulated_data = defaultdict(lambda: 0.0, count=0) +<<<<<<< HEAD iterator = iter(dataloader) #itertools.chain.from_iterable(itertools.repeat(dataloader)) +======= + # TODO: maybe back to: itertools.chain.from_iterable(itertools.repeat(dataloader)) + iterator = iter(dataloader) +>>>>>>> cc94186fc (continue) all_metrics = [] nlp.train(True) set_seed(seed) - + n_seen_samples = 0 epoch = 0 true_batches = [] @@ -643,8 +697,13 @@ def train( mininterval=5.0, ) as bar: for step in bar: +<<<<<<< HEAD #print("step ", step) if epoch > max_epochs: +======= + # print("step ", step) + if max_epochs and (epoch > max_epochs): +>>>>>>> cc94186fc (continue) print(f"Done, left steps: {max_steps - step}") # break if (step % validation_interval) == 0: @@ -670,15 +729,29 @@ def train( optimizer.zero_grad() for mini_batch in mini_batches: # print("mini", mini_batch["ecci_qualifier"]["targets"].shape[0]) +<<<<<<< HEAD true_batches.append(mini_batch["ecci_qualifier"]["targets"].shape[0]) +======= + true_batches.append( + mini_batch["ecci_qualifier"]["targets"].shape[0] + ) +>>>>>>> cc94186fc (continue) seen = False loss = torch.zeros((), device=accelerator.device) with nlp.cache(): for name, pipe in zip(pipe_names, trained_pipes): if not seen: +<<<<<<< HEAD n_seen_samples += mini_batch["ecci_qualifier"]["targets"].shape[0] epoch = 1 + (n_seen_samples / 4464) #print(f"Step: {step} - Seen: {n_seen_samples} - Epoch: 
{epoch}") +======= + n_seen_samples += mini_batch["ecci_qualifier"][ + "targets" + ].shape[0] + epoch = 1 + (n_seen_samples / 4464) + # print(f"{step} - Seen:{n_seen_samples} - Epoch:{epoch}") +>>>>>>> cc94186fc (continue) output = pipe(mini_batch[name]) if "loss" in output: loss += output["loss"] @@ -696,6 +769,8 @@ def train( print(init_batches) print(sorted(true_batches)) + print(sorted(true_batches)) + return nlp From e08483d0dd9f20e78111602316670781a535f12d Mon Sep 17 00:00:00 2001 From: Thomas PETIT-JEAN Date: Mon, 18 Nov 2024 13:48:49 +0000 Subject: [PATCH 12/12] fix merge conflicts --- .../span_classifier/span_classifier.py | 4 - edsnlp/train.py | 73 +------------------ 2 files changed, 2 insertions(+), 75 deletions(-) diff --git a/edsnlp/pipes/trainable/span_classifier/span_classifier.py b/edsnlp/pipes/trainable/span_classifier/span_classifier.py index 61f8ce6e5..0973ed171 100644 --- a/edsnlp/pipes/trainable/span_classifier/span_classifier.py +++ b/edsnlp/pipes/trainable/span_classifier/span_classifier.py @@ -484,11 +484,7 @@ def forward(self, batch: SpanClassifierBatchInput) -> BatchOutput: binding_scores[:, bindings_indexer], batch["targets"][:, group_idx], reduction="sum", -<<<<<<< HEAD - weight=torch.tensor([1.9, 0.7]).to(binding_scores.device) -======= weight=torch.tensor([1.9, 0.7]).to(binding_scores.device), ->>>>>>> cc94186fc (continue) ) ) assert not torch.isnan(losses[-1]).any(), "NaN loss" diff --git a/edsnlp/train.py b/edsnlp/train.py index fe48b24c3..51c4ccecc 100644 --- a/edsnlp/train.py +++ b/edsnlp/train.py @@ -164,12 +164,9 @@ def __init__( self.drop_last = drop_last self.buffer_size = buffer_size self.repeat = repeat -<<<<<<< HEAD -======= def set_repeat(self, repeat): self.repeat = repeat ->>>>>>> cc94186fc (continue) def __iter__(self): # Shuffle the dataset @@ -521,13 +518,7 @@ def train( ) assert not (max_steps and max_epochs), "Use only steps or epochs" if max_epochs: -<<<<<<< HEAD - max_steps = int(0.9*(4464 / batch_size[0])) - -======= max_steps = int(0.9 * (4464 / batch_size[0])) ->>>>>>> cc94186fc (continue) - set_seed(seed) # Loading and adapting the training and validation data with set_seed(data_seed): @@ -570,10 +561,9 @@ def train( max_steps = max_epochs * n_true_steps # TODO show mean batch size ? 
batch_sampler.set_repeat(repeat=repeat) - + dataloader = torch.utils.data.DataLoader( preprocessed, -<<<<<<< HEAD batch_sampler=LengthSortedBatchSampler( preprocessed, batch_size=batch_size[0], @@ -587,42 +577,6 @@ def train( shuffle=False, ) - true_steps = 0 - # for b in iter(dataloader): - # true_steps += b[0]["ecci_qualifier"]["targets"].shape[0] - # print(f"True: {true_steps} / Config: 4464") - # return - - batch_sampler=LengthSortedBatchSampler( - preprocessed, - batch_size=batch_size[0], - batch_unit=batch_size[1], - ), - #print("sampler", type(batch_sampler), len(batch_sampler), batch_sampler[0]) - for b in batch_sampler: - init_batches = sorted([len(data1["ecci_qualifier/targets"]) for data1 in b.dataset]) - - dataloader = torch.utils.data.DataLoader( - preprocessed, - batch_sampler=LengthSortedBatchSampler( - preprocessed, - batch_size=batch_size[0], - batch_unit=batch_size[1], - ), - collate_fn=SubBatchCollater( - nlp, - trf_pipe, - grad_accumulation_max_tokens=grad_accumulation_max_tokens, - ), - ) - -======= - batch_sampler=batch_sampler, - collate_fn=collate_fn, - shuffle=False, - ) - ->>>>>>> cc94186fc (continue) pipe_names, trained_pipes = zip(*nlp.torch_components()) print("Training", ", ".join(pipe_names)) @@ -675,12 +629,8 @@ def train( cumulated_data = defaultdict(lambda: 0.0, count=0) -<<<<<<< HEAD - iterator = iter(dataloader) #itertools.chain.from_iterable(itertools.repeat(dataloader)) -======= # TODO: maybe back to: itertools.chain.from_iterable(itertools.repeat(dataloader)) iterator = iter(dataloader) ->>>>>>> cc94186fc (continue) all_metrics = [] nlp.train(True) set_seed(seed) @@ -697,15 +647,9 @@ def train( mininterval=5.0, ) as bar: for step in bar: -<<<<<<< HEAD - #print("step ", step) - if epoch > max_epochs: -======= - # print("step ", step) if max_epochs and (epoch > max_epochs): ->>>>>>> cc94186fc (continue) print(f"Done, left steps: {max_steps - step}") - # break + break if (step % validation_interval) == 0: scores = scorer(nlp, val_docs) all_metrics.append( @@ -728,30 +672,19 @@ def train( mini_batches = next(iterator) optimizer.zero_grad() for mini_batch in mini_batches: - # print("mini", mini_batch["ecci_qualifier"]["targets"].shape[0]) -<<<<<<< HEAD - true_batches.append(mini_batch["ecci_qualifier"]["targets"].shape[0]) -======= true_batches.append( mini_batch["ecci_qualifier"]["targets"].shape[0] ) ->>>>>>> cc94186fc (continue) seen = False loss = torch.zeros((), device=accelerator.device) with nlp.cache(): for name, pipe in zip(pipe_names, trained_pipes): if not seen: -<<<<<<< HEAD - n_seen_samples += mini_batch["ecci_qualifier"]["targets"].shape[0] - epoch = 1 + (n_seen_samples / 4464) - #print(f"Step: {step} - Seen: {n_seen_samples} - Epoch: {epoch}") -======= n_seen_samples += mini_batch["ecci_qualifier"][ "targets" ].shape[0] epoch = 1 + (n_seen_samples / 4464) # print(f"{step} - Seen:{n_seen_samples} - Epoch:{epoch}") ->>>>>>> cc94186fc (continue) output = pipe(mini_batch[name]) if "loss" in output: loss += output["loss"] @@ -769,8 +702,6 @@ def train( print(init_batches) print(sorted(true_batches)) - print(sorted(true_batches)) - return nlp