diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index 06aac1ac1..d182c0ae0 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -13,7 +13,7 @@ env:
 
 jobs:
   Documentation:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v2

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 65de04d5a..b6b9198d0 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -24,7 +24,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, windows-latest, macos-latest]
+        os: [ubuntu-22.04, windows-latest, macos-latest]
 
     steps:
       - uses: actions/checkout@v4
@@ -42,7 +42,7 @@ jobs:
 
   build_sdist:
     name: Build source distribution
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v2
@@ -58,7 +58,7 @@ jobs:
     name: Upload to PyPI
     needs: [build_wheels, build_sdist]
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/download-artifact@v4
@@ -76,7 +76,7 @@ jobs:
       # repository_url: https://test.pypi.org/legacy/
 
   Documentation:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v3

diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml
index 0b849728b..ac64a28b3 100644
--- a/.github/workflows/test-build.yml
+++ b/.github/workflows/test-build.yml
@@ -17,7 +17,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, windows-latest, macos-latest]
+        os: [ubuntu-22.04, windows-latest, macos-latest]
 
     steps:
       - uses: actions/checkout@v2
@@ -30,7 +30,7 @@ jobs:
 
   build_sdist:
     name: Build source distribution
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v2

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 728547434..f55139d32 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -15,7 +15,7 @@ jobs:
   linting:
     name: Linting
     if: github.event_name == 'pull_request'
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v3
         with:
@@ -32,7 +32,7 @@ jobs:
 
   pytest:
     name: Pytest
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     strategy:
       fail-fast: true
       matrix:
@@ -120,7 +120,7 @@ jobs:
 
   documentation:
     name: Documentation
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v2
@@ -150,7 +150,7 @@ jobs:
 
   simple-installation:
     name: Simple installation
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     strategy:
       fail-fast: true
       matrix:

diff --git a/changelog.md b/changelog.md
index c88ce1d74..1d1ae7146 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,19 @@
 # Changelog
 
+## Unreleased
+
+### Added
+
+- `EDS.Tokenizer` now handles `-\n` (found in text when splitting a long word with a linebreak) as a specific token, which can be discarded by the normalizer pipe.
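+
+  A minimal sketch of the new behavior (mirroring `test_eds_tokenizer_intraword_split` added at the bottom of this diff; outputs shown as comments):
+
+  ```python
+  import spacy
+
+  nlp = spacy.blank("eds")
+  # "dia-\nbete" is split into "dia", "-\n" and "bete", so the "-\n"
+  # token can later be excluded by the eds.normalizer pipe
+  [(w.text, w.whitespace_) for w in nlp("Un dia-\nbete ici")]
+  # [('Un', ' '), ('dia', ''), ('-\n', ''), ('bete', ' '), ('ici', '')]
+  ```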
+
+### Fixed
+
+- Use `ubuntu-22.04` instead of `ubuntu-latest` in CI to keep Python 3.7 compatibility
+- When using `ignore_space_tokens=True`, words separated only by linebreaks are now collected (via `get_text()`) with spaces in between
+- The `process` method of `Qualifiers` now accepts a `Span` as input, and treats it as a `Doc` to avoid alignment issues
+- The `detailed_status_mapping` of the disorder/behavior pipes now handles the `KeyError: None` that could occur when loading pre-annotated docs without instantiating the pipes beforehand
+- Various fixes to the Alcohol and Tobacco pipes
+
 ## v0.13.1
 
 ### Added

diff --git a/docs/pipes/ner/behaviors/alcohol.md b/docs/pipes/ner/behaviors/alcohol.md
index f8c5772fb..be987c188 100644
--- a/docs/pipes/ner/behaviors/alcohol.md
+++ b/docs/pipes/ner/behaviors/alcohol.md
@@ -1,5 +1,7 @@
 # Alcohol consumption {: #edsnlp.pipes.ner.behaviors.alcohol.factory.create_component }
 
+--8<-- "docs/pipes/ner/disorders/warning.md"
+
 ::: edsnlp.pipes.ner.behaviors.alcohol.factory.create_component
     options:
         heading_level: 2

diff --git a/docs/pipes/ner/behaviors/index.md b/docs/pipes/ner/behaviors/index.md
index 8544c0255..aac46ed71 100644
--- a/docs/pipes/ner/behaviors/index.md
+++ b/docs/pipes/ner/behaviors/index.md
@@ -2,99 +2,6 @@
 
 ## Presentation
 
-EDS-NLP offers two components to extract behavioral patterns, namely the tobacco and alcohol consumption status. Each component is based on the ContextualMatcher component.
-Some general considerations about those components:
+EDS-NLP offers two components to extract behavioral patterns, namely the tobacco and alcohol consumption status. Each component is based on the [ContextualMatcher][edsnlp.pipes.core.contextual_matcher.ContextualMatcher] matcher, available via the `eds.contextual_matcher` component.
 
-- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`.
-- The matched comorbidity is also available under the `ent.label_` of each match.
-- Matches have an associated `_.status` attribute taking the value `1`, or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details.
-- Some components add additional information to matches. For instance, the `tobacco` adds, if relevant, extracted *pack-year* (= *paquet-année*). Those information are available under the `ent._.assigned` attribute.
-- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline with the following parameters:
-  ```{ .python .no-check }
-  nlp.add_pipe(
-      eds.normalizer(
-          accents=True,
-          lowercase=True,
-          quotes=True,
-          spaces=True,
-          pollution=dict(
-              information=True,
-              bars=True,
-              biology=True,
-              doctors=True,
-              web=True,
-              coding=True,
-              footer=True,
-          ),
-      ),
-  )
-  ```
-
-!!! warning "Use qualifiers"
-    Those components **should be used with a qualification pipeline** to avoid extracted unwanted matches. At the very least, you can use available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developed and trained specifically for those components. For privacy reason, the model isn't publicly available yet.
-
-    !!! aphp "Use the ML model"
-
-        The model will soon be available in the models catalogue of AP-HP's CDW.
-
-## Usage
-
-```{ .python .no-check }
-import edsnlp, edsnlp.pipes as eds
-
-nlp = edsnlp.blank("eds")
-nlp.add_pipe(eds.sentences())
-nlp.add_pipe(
-    eds.normalizer(
-        accents=True,
-        lowercase=True,
-        quotes=True,
-        spaces=True,
-        pollution=dict(
-            information=True,
-            bars=True,
-            biology=True,
-            doctors=True,
-            web=True,
-            coding=True,
-            footer=True,
-        ),
-    ),
-)
-nlp.add_pipe(eds.tobacco())
-nlp.add_pipe(eds.diabetes())
-
-text = """
-Compte-rendu de consultation.
-
-Je vois ce jour M. SCOTT pour le suivi de sa rétinopathie diabétique.
-Le patient va bien depuis la dernière fois.
-Je le félicite pour la poursuite de son sevrage tabagique (toujours à 10 paquet-année).
-
-Sur le plan de son diabète, la glycémie est stable.
-"""
-
-doc = nlp(text)
-
-doc.spans
-# Out: {
-#     'pollutions': [],
-#     'tobacco': [sevrage tabagique (toujours à 10 paquet-année],
-#     'diabetes': [rétinopathie diabétique, diabète]
-# }
-
-tobacco_matches = doc.spans["tobacco"]
-tobacco_matches[0]._.detailed_status
-# Out: "ABSTINENCE" #
-
-tobacco_matches[0]._.assigned["PA"]  # paquet-année
-# Out: 10 # (1)
-
-
-diabetes = doc.spans["diabetes"]
-(diabetes[0]._.detailed_status, diabetes[1]._.detailed_status)
-# Out: ('WITH_COMPLICATION', 'WITHOUT_COMPLICATION') # (2)
-```
-
-1. Here we see an example of additional information that can be extracted
-2. Here we see the importance of document-level aggregation to extract the correct severity of each comorbidity.
+--8<-- "docs/pipes/ner/disorders/presentation.md"

diff --git a/docs/pipes/ner/disorders/index.md b/docs/pipes/ner/disorders/index.md
index e261fcd98..ad0321b0b 100644
--- a/docs/pipes/ner/disorders/index.md
+++ b/docs/pipes/ner/disorders/index.md
@@ -2,58 +2,6 @@
 
 ## Presentation
 
-The following components extract 16 different conditions from the [Charlson Comorbidity Index](https://www.rdplf.org/calculateurs/pages/charlson/charlson.html). Each component is based on the ContextualMatcher component.
+The following components extract 16 different conditions from the [Charlson Comorbidity Index](https://www.rdplf.org/calculateurs/pages/charlson/charlson.html). Each component is based on the [ContextualMatcher][edsnlp.pipes.core.contextual_matcher.ContextualMatcher] matcher, available via the `eds.contextual_matcher` component.
 
-The components were developed by AP-HP's Data Science team with a team of medical experts, following the insights of the algorithm proposed by [@petitjean_2024]
-
-Some general considerations about those components:
-
-- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`.
-- The matched comorbidity is also available under the `ent.label_` of each match.
-- Matches have an associated `_.status` attribute taking the value `1`, or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details.
-- Some components add additional information to matches. For instance, the `tobacco` adds, if relevant, extracted *pack-year* (= *paquet-année*). Those information are available under the `ent._.assigned` attribute.
-- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline with the following parameters:
-
-    ```{ .python .no-check }
-    import edsnlp, edsnlp.pipes as eds
-    ...
-
-    nlp.add_pipe(
-        eds.normalizer(
-            accents=True,
-            lowercase=True,
-            quotes=True,
-            spaces=True,
-            pollution=dict(
-                information=True,
-                bars=True,
-                biology=True,
-                doctors=True,
-                web=True,
-                coding=True,
-                footer=True,
-            ),
-        ),
-    )
-    ```
-
-!!! warning "Use qualifiers"
-    Those components **should be used with a qualification pipeline** to avoid extracted unwanted matches. At the very least, you can use available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developed and trained specifically for those components. For privacy reason, the model isn't publicly available yet.
-
-    !!! aphp "Use the ML model"
-
-        The model will soon be available in the models catalogue of AP-HP's CDW.
-
-!!! tip "On the medical definition of the comorbidities"
-
-    Those components were developped to extract **chronic** and **symptomatic** conditions only.
-
-## Aggregation
-
-For relevant phenotyping, matches should be aggregated at the document-level. For instance, a document might mention a complicated diabetes at the beginning ("*Le patient a une rétinopathie diabétique*"), and then refer to this diabetes without mentionning that it is complicated anymore ("*Concernant son diabète, le patient ...*").
-Thus, a good and simple aggregation rule is, for each comorbidity, to
-
-- disregard all entities tagged as irrelevant by the qualification component(s)
-- take the maximum (i.e., the most severe) status of the leftover entities
-
-An implementation of this rule is presented [here][aggregating-results]
+--8<-- "docs/pipes/ner/disorders/presentation.md"

diff --git a/docs/pipes/ner/disorders/presentation.md b/docs/pipes/ner/disorders/presentation.md
new file mode 100644
index 000000000..1918867a8
--- /dev/null
+++ b/docs/pipes/ner/disorders/presentation.md
@@ -0,0 +1,77 @@
+The components were developed by AP-HP's Data Science team with a team of medical experts, following the insights of the algorithm proposed by [@petitjean_2024].
+
+Some general considerations about those components:
+
+- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`.
+- The matched comorbidity is also available under the `ent.label_` of each match.
+- Matches have an associated `_.status` attribute taking the value `1` or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details.
+- Some components add additional information to matches. For instance, the `tobacco` component adds, if relevant, the extracted *pack-year* (= *paquet-année*) count. This information is available under the `ent._.assigned` attribute.
+- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline (see [Usage](#usage) below).
+
+--8<-- "docs/pipes/ner/disorders/warning.md"
+
+!!! warning "Use qualifiers"
+    Those components **should be used with a qualification pipeline** to avoid extracting unwanted matches. At the very least, you should use the available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better still, a machine-learning qualification component was developed and trained specifically for those components. For privacy reasons, the model isn't publicly available yet.
+
+    !!! aphp "Use the ML model"
+
+ +## Usage + +```{ .python .no-check } +import edsnlp, edsnlp.pipes as eds + +nlp = edsnlp.blank("eds") +nlp.add_pipe(eds.sentences()) +nlp.add_pipe( + eds.normalizer( + accents=True, + lowercase=True, + quotes=True, + spaces=True, + pollution=dict( + biology=True, #(1) + coding=True, #(2) + ), + ), +) +nlp.add_pipe(eds.tobacco()) +nlp.add_pipe(eds.diabetes()) + +text = """ +Compte-rendu de consultation. + +Je vois ce jour M. SCOTT pour le suivi de sa rétinopathie diabétique. +Le patient va bien depuis la dernière fois. +Je le félicite pour la poursuite de son sevrage tabagique (toujours à 10 paquet-année). + +Sur le plan de son diabète, la glycémie est stable. +""" + +doc = nlp(text) + +doc.spans +# Out: { +# 'pollutions': [], +# 'tobacco': [sevrage tabagique (toujours à 10 paquet-année], +# 'diabetes': [rétinopathie diabétique, diabète] +# } + +tobacco_matches = doc.spans["tobacco"] +tobacco_matches[0]._.detailed_status +# Out: "ABSTINENCE" # + +tobacco_matches[0]._.assigned["PA"] # paquet-année +# Out: 10 # (3) + + +diabetes = doc.spans["diabetes"] +(diabetes[0]._.detailed_status, diabetes[1]._.detailed_status) +# Out: ('WITH_COMPLICATION', 'WITHOUT_COMPLICATION') # (4) +``` + +1. This will discard mentions of biology results, which often leads to false positive +2. This will discard mentions of ICD10 coding that sometimes appears at the end of clinical documents +3. Here we see an example of additional information that can be extracted +4. Here we see the importance of document-level aggregation to extract the correct severity of each comorbidity. diff --git a/docs/pipes/ner/disorders/warning.md b/docs/pipes/ner/disorders/warning.md new file mode 100644 index 000000000..6268cf664 --- /dev/null +++ b/docs/pipes/ner/disorders/warning.md @@ -0,0 +1,7 @@ +!!! danger "On overlapping entities" + When using multiple disorders or behavior pipelines, some entities may be extracted from different pipes. For instance: + + * "Intoxication éthylotabagique" will be tagged both by `eds.tobacco` and `eds.alcohol` + * "Chirrose alcoolique" will be tagged both by `eds.liver_disease` and `eds.alcohol` + + As `doc.ents` discards overlapping entities, you should use `doc.spans` instead. diff --git a/edsnlp/core/pipeline.py b/edsnlp/core/pipeline.py index 0a932be70..3e0660419 100644 --- a/edsnlp/core/pipeline.py +++ b/edsnlp/core/pipeline.py @@ -761,7 +761,13 @@ def to_disk( if ( os.path.exists(path) and os.listdir(path) - and not os.path.exists(path / "config.cfg") + and not ( + os.path.exists(path / "config.cfg") or + ( + os.path.exists(path / "meta.json") and + os.path.exists(path / "tokenizer") + ) + ) ): raise Exception( "The directory already exists and doesn't appear to be a" diff --git a/edsnlp/language.py b/edsnlp/language.py index f61c804b3..01ccae292 100644 --- a/edsnlp/language.py +++ b/edsnlp/language.py @@ -41,7 +41,15 @@ class EDSLanguage(French): Defaults = EDSDefaults -TOKENIZER_EXCEPTIONS = [r"Dr\.", r"Pr\.", r"M\.", r"Mme\.", r"Mlle\.", r"(?i:(?:ep\.))"] +TOKENIZER_EXCEPTIONS = [ + r"Dr\.", + r"Pr\.", + r"M\.", + r"Mme\.", + r"Mlle\.", + r"(?i:(?:ep\.))", + r"(? 
List[Span]:
         stopped = self.negation.process(span)
 
         if not any(stopped_token.negation for stopped_token in stopped.tokens):
             span._.status = 2
-
-        if "zero_after" in span._.assigned.keys():
-            span._.negation = True
+        else:
+            if "zero_after" in span._.assigned.keys():
+                span._.negation = True
 
         yield span

diff --git a/edsnlp/pipes/ner/behaviors/alcohol/patterns.py b/edsnlp/pipes/ner/behaviors/alcohol/patterns.py
index 38c795926..7777225d3 100644
--- a/edsnlp/pipes/ner/behaviors/alcohol/patterns.py
+++ b/edsnlp/pipes/ner/behaviors/alcohol/patterns.py
@@ -3,7 +3,7 @@
     regex=[
         r"\balco[ol]",
         r"\bethyl",
-        r"(?
 Doc: annotated spaCy Doc object
         """
         spans = list(self.process(doc))
+        all_detailed_status = set(self.detailed_status_mapping.keys())
 
         for span in spans:
-            span._.detailed_status = self.detailed_status_mapping[span._.status]
+            if span._.status is not None and span._.status not in all_detailed_status:
+                raise ValueError(
+                    f"Got incorrect status value for '{span}'. Expected "
+                    f"None or one of {all_detailed_status}, got {span._.status}"
+                )
+            span._.detailed_status = self.detailed_status_mapping.get(
+                span._.status,
+                None,
+            )
 
         self.set_spans(doc, filter_spans(spans))

diff --git a/edsnlp/pipes/qualifiers/base.py b/edsnlp/pipes/qualifiers/base.py
index 521baf0e6..0b41fac9e 100644
--- a/edsnlp/pipes/qualifiers/base.py
+++ b/edsnlp/pipes/qualifiers/base.py
@@ -172,9 +172,13 @@ def get_matches(self, doc: Doc) -> List[Span]:
 
         return list(matches)
 
-    def process(self, doc: Doc) -> BaseQualifierResults:
+    def ensure_doc(self, doc: Union[Doc, Span]) -> Doc:
+        return doc if not hasattr(doc, "as_doc") else doc.as_doc()
+
+    def process(self, doc_like: Union[Doc, Span]) -> BaseQualifierResults:
+        doc_like = self.ensure_doc(doc_like)  # pragma: no cover
         raise NotImplementedError
 
     def __call__(self, doc: Doc) -> Doc:
-        results = self.process(doc)
+        results = self.process(doc)  # pragma: no cover
         raise NotImplementedError(f"{type(results)} should be used to tag the document")

diff --git a/edsnlp/pipes/qualifiers/family/family.py b/edsnlp/pipes/qualifiers/family/family.py
index e979e71de..12943d9eb 100644
--- a/edsnlp/pipes/qualifiers/family/family.py
+++ b/edsnlp/pipes/qualifiers/family/family.py
@@ -187,7 +187,8 @@ def set_extensions(self) -> None:
         if not Doc.has_extension("family"):
             Doc.set_extension("family", default=[])
 
-    def process(self, doc: Doc) -> FamilyResults:
+    def process(self, doc_like: Union[Doc, Span]) -> FamilyResults:
+        doc = self.ensure_doc(doc_like)
         matches = self.get_matches(doc)
 
         terminations = [m for m in matches if m.label_ == "termination"]

diff --git a/edsnlp/pipes/qualifiers/history/history.py b/edsnlp/pipes/qualifiers/history/history.py
index 2dc8d56d8..ab5469c95 100644
--- a/edsnlp/pipes/qualifiers/history/history.py
+++ b/edsnlp/pipes/qualifiers/history/history.py
@@ -326,7 +326,8 @@ def set_extensions(self) -> None:
             getter=deprecated_getter_factory("antecedent_cues", "history_cues"),
         )
 
-    def process(self, doc: Doc) -> HistoryResults:
+    def process(self, doc_like: Union[Doc, Span]) -> HistoryResults:
+        doc = self.ensure_doc(doc_like)
         note_datetime = None
         if doc._.note_datetime is not None:
             try:

diff --git a/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py b/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py
index 924d2cf63..68ed1d1d1 100644
--- a/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py
+++ b/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py
@@ -262,7 +262,8 @@ def load_verbs(
             list_hypo_verbs_following,
         )
 
-    def process(self, doc: Doc) -> HypothesisResults:
+    def process(self, doc_like: Union[Doc, Span]) -> HypothesisResults:
+        doc = self.ensure_doc(doc_like)
         matches = self.get_matches(doc)
 
         terminations = [m for m in matches if m.label_ == "termination"]

diff --git a/edsnlp/pipes/qualifiers/negation/negation.py b/edsnlp/pipes/qualifiers/negation/negation.py
index fb2c7878f..ea4a4fc40 100644
--- a/edsnlp/pipes/qualifiers/negation/negation.py
+++ b/edsnlp/pipes/qualifiers/negation/negation.py
@@ -295,7 +295,8 @@ def __call__(self, doc: Doc) -> Doc:
                     token._.negation = True
         return doc
 
-    def process(self, doc: Doc) -> NegationResults:
+    def process(self, doc_like: Union[Doc, Span]) -> NegationResults:
+        doc = self.ensure_doc(doc_like)
         matches = self.get_matches(doc)
 
         terminations = [m for m in matches if m.label_ == "termination"]

diff --git a/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py b/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py
index 77b0cbe91..759eb7091 100644
--- a/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py
+++ b/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py
@@ -226,7 +226,8 @@ def load_verbs(self, verbs: List[str]) -> List[str]:
 
         return list_rep_verbs
 
-    def process(self, doc: Doc) -> ReportedSpeechResults:
+    def process(self, doc_like: Union[Doc, Span]) -> ReportedSpeechResults:
+        doc = self.ensure_doc(doc_like)
         matches = self.get_matches(doc)
         matches += list(self.regex_matcher(doc, as_spans=True))

diff --git a/edsnlp/pipes/trainable/span_classifier/span_classifier.py b/edsnlp/pipes/trainable/span_classifier/span_classifier.py
index b7111cbe2..0973ed171 100644
--- a/edsnlp/pipes/trainable/span_classifier/span_classifier.py
+++ b/edsnlp/pipes/trainable/span_classifier/span_classifier.py
@@ -474,11 +474,17 @@ def forward(self, batch: SpanClassifierBatchInput) -> BatchOutput:
         #   - `negated=False` and `negated=True`
         for group_idx, bindings_indexer in enumerate(self.bindings_indexers):
             if "targets" in batch:
+                # print("BATCH")
+                # print(span_embeds.shape)
+                # print(batch.keys())
+                # print(batch["targets"][:,].shape)
+                # print(batch["targets"][:,].sum().item())
                 losses.append(
                     F.cross_entropy(
                         binding_scores[:, bindings_indexer],
                         batch["targets"][:, group_idx],
                         reduction="sum",
+                        weight=torch.tensor([1.9, 0.7]).to(binding_scores.device),
                     )
                 )
                 assert not torch.isnan(losses[-1]).any(), "NaN loss"

diff --git a/edsnlp/train.py b/edsnlp/train.py
index 8f0c01821..51c4ccecc 100644
--- a/edsnlp/train.py
+++ b/edsnlp/train.py
@@ -1,4 +1,3 @@
-import itertools
 import json
 import math
 import random
@@ -143,6 +142,9 @@ class LengthSortedBatchSampler:
     buffer_size: Optional[int]
         The size of the buffer to use to shuffle the batches. If None, the buffer
         will be approximately the size of the dataset.
+    repeat: Optional[int]
+        How many times the sampler will iterate over the dataset. If None, it
+        iterates indefinitely.
     """
 
     def __init__(
@@ -151,8 +153,9 @@ def __init__(
         batch_size: int,
         batch_unit: str,
         noise=1,
-        drop_last=True,
+        drop_last=False,
         buffer_size: Optional[int] = None,
+        repeat: Optional[int] = None,
     ):
         self.dataset = dataset
         self.batch_size = batch_size
@@ -160,6 +163,10 @@ def __init__(
         self.noise = noise
         self.drop_last = drop_last
         self.buffer_size = buffer_size
+        self.repeat = repeat
+
+    def set_repeat(self, repeat):
+        self.repeat = repeat
 
     def __iter__(self):
         # Shuffle the dataset
@@ -181,6 +188,7 @@ def sample_len(idx, noise=True):
         elif self.batch_unit == "spans":
 
             def sample_len(idx, noise=True):
+                # TODO: implement noise here?
                return len(
                     next(
                         v
                         for k, v in self.dataset[idx].items()
                         if k.endswith("begins")
                     )
                 )
@@ -193,6 +201,7 @@ def sample_len(idx, noise=True):
         def make_batches():
             total = 0
             batch = []
+            n_iter = 0
             for seq_size, idx in sorted_sequences:
                 if total and total + seq_size > self.batch_size:
                     yield batch
                     batch = []
                 total += seq_size
                 batch.append(idx)
+            if not self.drop_last:
+                yield batch
 
         # Shuffle the batches in buffer that contain approximately
         # the full dataset to add more randomness
@@ -214,12 +225,12 @@ def make_batches():
         # Sort sequences by length +- some noise
         sorted_sequences = chain.from_iterable(
             sorted((sample_len(i), i) for i in range(len(self.dataset)))
-            for _ in repeat(None)
+            for _ in repeat(None, times=self.repeat)
        )
 
         # Batch sorted sequences
         batches = make_batches()
-        buffers = batchify(batches, buffer_size)
+        buffers = batchify(batches, buffer_size, drop_last=self.drop_last)
         for buffer in buffers:
             random.shuffle(buffer)
             yield from buffer
@@ -243,8 +254,13 @@ def __init__(self, nlp, embedding, grad_accumulation_max_tokens):
         self.nlp = nlp
         self.embedding: Transformer = embedding
         self.grad_accumulation_max_tokens = grad_accumulation_max_tokens
+        self.i = 0
 
     def __call__(self, seq):
+        mini_batches = self.get_mini_batches(seq)
+        return [self.nlp.collate(b) for b in mini_batches]
+
+    def get_mini_batches(self, seq):
         total = 0
         mini_batches = [[]]
         for sample_features in seq:
@@ -263,7 +279,7 @@ def __call__(self, seq):
                 mini_batches.append([])
             total += num_tokens
             mini_batches[-1].append(sample_features)
-        return [self.nlp.collate(b) for b in mini_batches]
+        return mini_batches
 
 
 def subset_doc(doc: Doc, start: int, end: int) -> Doc:
@@ -483,6 +499,7 @@ def train(
     seed: int = 42,
     data_seed: int = 42,
     max_steps: int = 1000,
+    max_epochs: int | None = None,
     batch_size: BatchSizeArg = 2000,
     transformer_lr: float = 5e-5,
     task_lr: float = 3e-4,
@@ -499,7 +516,9 @@ def train(
         for module_name, module in pipe.named_component_modules()
         if isinstance(module, Transformer)
     )
-
+    assert not (max_steps and max_epochs), "Use only steps or epochs"
+    if max_epochs:
+        max_steps = int(0.9 * (4464 / batch_size[0]))
     set_seed(seed)
     # Loading and adapting the training and validation data
     with set_seed(data_seed):
@@ -519,6 +538,30 @@ def train(
                 show_progress=True
             )
         )
+
+    batch_sampler = LengthSortedBatchSampler(
+        preprocessed,
+        batch_size=batch_size[0],
+        batch_unit=batch_size[1],
+        repeat=max_epochs,
+    )
+    collate_fn = SubBatchCollater(
+        nlp,
+        trf_pipe,
+        grad_accumulation_max_tokens=grad_accumulation_max_tokens,
+    )
+
+    if max_epochs is not None:
+        # we have to make a dry run
+        batch_sampler.set_repeat(repeat=1)  # single epoch
+        for batch in batch_sampler:
+            batch_collated = collate_fn.get_mini_batches(batch)
+            n_true_steps = len(batch_collated)
+        print(f"True number of steps: {n_true_steps}")
+        max_steps = max_epochs * n_true_steps
+        # TODO: show mean batch size?
+        batch_sampler.set_repeat(repeat=repeat)
+
     dataloader = torch.utils.data.DataLoader(
         preprocessed,
         batch_sampler=LengthSortedBatchSampler(
@@ -531,7 +574,9 @@ def train(
             trf_pipe,
             grad_accumulation_max_tokens=grad_accumulation_max_tokens,
         ),
+        shuffle=False,
     )
+
     pipe_names, trained_pipes = zip(*nlp.torch_components())
     print("Training", ", ".join(pipe_names))
@@ -584,11 +629,16 @@ def train(
 
     cumulated_data = defaultdict(lambda: 0.0, count=0)
 
-    iterator = itertools.chain.from_iterable(itertools.repeat(dataloader))
+    # TODO: maybe back to: itertools.chain.from_iterable(itertools.repeat(dataloader))
+    iterator = iter(dataloader)
     all_metrics = []
     nlp.train(True)
     set_seed(seed)
 
+    n_seen_samples = 0
+    epoch = 0
+    true_batches = []
+
     with RichTablePrinter(LOGGER_FIELDS, auto_refresh=False) as logger:
         with tqdm(
             range(max_steps + 1),
@@ -597,6 +647,9 @@ def train(
             mininterval=5.0,
         ) as bar:
             for step in bar:
+                if max_epochs and (epoch > max_epochs):
+                    print(f"Done, left steps: {max_steps - step}")
+                    break
                 if (step % validation_interval) == 0:
                     scores = scorer(nlp, val_docs)
                     all_metrics.append(
@@ -614,13 +667,24 @@ def train(
                     )
                     logger.log_metrics(flatten_dict(all_metrics[-1]))
                 if step == max_steps:
+                    print(f"Done, epoch {epoch}")
                     break
                 mini_batches = next(iterator)
 
                 optimizer.zero_grad()
                 for mini_batch in mini_batches:
+                    true_batches.append(
+                        mini_batch["ecci_qualifier"]["targets"].shape[0]
+                    )
+                    seen = False
                     loss = torch.zeros((), device=accelerator.device)
                     with nlp.cache():
                         for name, pipe in zip(pipe_names, trained_pipes):
+                            if not seen:
+                                n_seen_samples += mini_batch["ecci_qualifier"][
+                                    "targets"
+                                ].shape[0]
+                                epoch = 1 + (n_seen_samples / 4464)
+                            # print(f"{step} - Seen:{n_seen_samples} - Epoch:{epoch}")
                             output = pipe(mini_batch[name])
                             if "loss" in output:
                                 loss += output["loss"]
@@ -634,6 +698,9 @@ def train(
 
             torch.nn.utils.clip_grad_norm_(grad_params, max_grad_norm)
             optimizer.step()
+
+    print(init_batches)
+    print(sorted(true_batches))
 
     return nlp

diff --git a/edsnlp/utils/doc_to_text.py b/edsnlp/utils/doc_to_text.py
index b9ea7043e..2a525e94a 100644
--- a/edsnlp/utils/doc_to_text.py
+++ b/edsnlp/utils/doc_to_text.py
@@ -82,8 +82,8 @@ def aggregate_tokens(
     else:
         keep_list = [True] * len(arr)
 
-    for i, (str_hash, space, keep) in enumerate(
-        zip(tokens_text, tokens_space, keep_list)
+    for i, (str_hash, tag_hash, space, keep) in enumerate(
+        zip(tokens_text, tokens_tag, tokens_space, keep_list)
     ):
         if keep:
             if space:
@@ -99,6 +99,12 @@ def aggregate_tokens(
                 offset += len(part)
                 ends[i] = offset
         else:
+            if i > 0 and tag_hash == space_hash:
+                if text_parts[i - 1][-1:] and (
+                    text_parts[i - 1][-1:] not in (" ", "\n")
+                ):
+                    text_parts[i - 1] += " "
+                    offset += 1
             begins[i] = offset
             ends[i] = offset

diff --git a/edsnlp/utils/span_getters.py b/edsnlp/utils/span_getters.py
index ce07acc61..dc5b8330f 100644
--- a/edsnlp/utils/span_getters.py
+++ b/edsnlp/utils/span_getters.py
@@ -42,11 +42,14 @@ def get_spans(doc, span_getter):
     if callable(span_getter):
         yield from span_getter(doc)
         return
+    seen = set()
     for key, span_filter in span_getter.items():
         if key == "*":
             candidates = (span for group in doc.spans.values() for span in group)
         else:
             candidates = doc.spans.get(key, ()) if key != "ents" else doc.ents
+        candidates = [candidate for candidate in candidates if hash(candidate) not in seen]
+        seen |= set(hash(candidate) for candidate in candidates)
         if span_filter is True:
             yield from candidates
         else:

diff --git a/tests/pipelines/core/test_normalisation.py b/tests/pipelines/core/test_normalisation.py
index 4cfa7b039..3628145bb 100644
--- a/tests/pipelines/core/test_normalisation.py
+++ b/tests/pipelines/core/test_normalisation.py
@@ -1,3 +1,4 @@
+import spacy
 from pytest import fixture
 
 from edsnlp.matchers.utils import get_text
@@ -25,7 +26,6 @@ def test_full_normalization(doc):
 @fixture
 def nlp_factory(blank_nlp):
     def f(a=False, lc=False, q=False, p=False):
-
         if a:
             a = dict(accents=accents)
         if q:
@@ -48,7 +48,6 @@ def f(a=False, lc=False, q=False, p=False):
 
 
 def test_normalization_accents(nlp_factory, text):
-
     nlp = nlp_factory(a=True)
     doc = nlp(text)
 
@@ -58,7 +57,6 @@ def test_normalization_accents(nlp_factory, text):
 
 
 def test_normalization_spaces(nlp_factory, text):
-
     nlp = nlp_factory(a=True)
     doc = nlp("Phrase avec des espaces \n et un retour à la ligne")
 
@@ -67,7 +65,6 @@ def test_normalization_spaces(nlp_factory, text):
 
 
 def test_normalization_quotes(nlp_factory, text):
-
     nlp = nlp_factory(q=True)
     doc = nlp(text)
 
@@ -79,7 +76,6 @@ def test_normalization_quotes(nlp_factory, text):
 
 
 def test_normalization_lowercase(nlp_factory, text):
-
     nlp = nlp_factory(lc=True)
     doc = nlp(text)
 
@@ -88,8 +84,24 @@ def test_normalization_lowercase(nlp_factory, text):
     assert norm.startswith("l'aïeul")
 
 
-def test_normalization_pollution(nlp_factory, text):
+def test_normalization_pollution_with_eds_lang():
+    nlp = spacy.blank("eds")
+    nlp.add_pipe("eds.normalizer")
+    text = "Il faut soigner ce diab-\nete"
+    doc = nlp(text)
+    norm = get_text(doc, attr="NORM", ignore_excluded=True)
+    assert norm == "il faut soigner ce diabete"
+
+
+def test_normalization_linebreak_no_space(nlp_factory):
+    nlp = nlp_factory()
+    text = "Mode de vie: \nTabac\nAlcool\nPas de sport"
+    doc = nlp(text)
+    norm = get_text(doc, attr="NORM", ignore_excluded=True, ignore_space_tokens=True)
+    assert norm == "Mode de vie: Tabac Alcool Pas de sport"
+
 
+def test_normalization_pollution(nlp_factory, text):
     nlp = nlp_factory(p=True)
     doc = nlp(text)

diff --git a/tests/pipelines/ner/disorders/alcohol.py b/tests/pipelines/ner/disorders/alcohol.py
index 19f261f5b..f9fb65234 100644
--- a/tests/pipelines/ner/disorders/alcohol.py
+++ b/tests/pipelines/ner/disorders/alcohol.py
@@ -9,6 +9,10 @@
         True,
         True,
         True,
+        True,
+        True,
+        True,
+        False,
     ],
     detailled_status=[
         None,
@@ -20,6 +24,10 @@
         None,
         "ABSTINENCE",
         None,
+        None,
+        "ABSTINENCE",
+        None,
+        None,
     ],
     negation=[
         None,
@@ -27,10 +35,14 @@
         None,
         None,
         None,
-        None,
+        False,
         True,
         None,
         True,
+        False,
+        False,
+        False,
+        None,
     ],
     assign=None,
     texts=[
@@ -38,10 +50,14 @@
         "OH chronique.",
         "Prise d'alcool occasionnelle",
         "Application d'un pansement alcoolisé",
-        "Alcoolisme sevré",
-        "Alcoolisme non sevré",
+        "Présence d'un alcoolisme sevré",
+        "Présence d'un alcoolisme non sevré",
         "Alcool: 0",
         "Le patient est en cours de sevrage éthylotabagique",
         "Patient alcoolique: non.",
+        "On a un alcoolique non sevré depuis 10 ans.",
+        "Alcoolisme sevré",
+        "Alcoolisme non sevré",
+        "Dosage vitamines 25-OH",
     ],
 )

diff --git a/tests/pipelines/ner/disorders/test_all.py b/tests/pipelines/ner/disorders/test_all.py
index 7eca71125..8f37bc6e9 100644
--- a/tests/pipelines/ner/disorders/test_all.py
+++ b/tests/pipelines/ner/disorders/test_all.py
@@ -99,7 +99,10 @@ def check(self):
         for ent in ents:
             assert ent.label_ == self.disorder
             if negation is not None:
-                assert ent._.negation == negation
+                if negation:
+                    assert ent._.negation == negation
+                else:
+                    assert ent._.negation is None
 
         if not ents:
             continue

diff --git a/tests/pipelines/ner/disorders/tobacco.py b/tests/pipelines/ner/disorders/tobacco.py
index 4ea3fa934..e46b000e9 100644
--- a/tests/pipelines/ner/disorders/tobacco.py
+++ b/tests/pipelines/ner/disorders/tobacco.py
@@ -9,6 +9,7 @@
         True,
         True,
         True,
+        True,
     ],
     detailled_status=[
         None,
@@ -20,6 +21,7 @@
         "ABSTINENCE",
         None,
         None,
+        None,
     ],
     negation=[
         None,
@@ -31,6 +33,7 @@
         None,
         True,
         True,
+        False,
     ],
     assign=[{"PA": 15}] + 8 * [None],
     texts=[
@@ -38,10 +41,11 @@
         "Patient tabagique",
         "Tabagisme festif",
         "On a un tabagisme ancien",
-        "Tabac: 0",
-        "Tabagisme passif",
-        "Tabac: sevré depuis 5 ans",
+        "Pour le tabac: 0",
+        "Notion de tabagisme passif",
+        "Concernant le tabac: sevré depuis 5 ans",
         "Le patient ne fume aucun truc.",
         "Le patient fume 0 PA.",
+        "On a un tabagique non sevré depuis 10 ans.",
     ],
 )

diff --git a/tests/test_language.py b/tests/test_language.py
index 369c6df12..363a7f117 100644
--- a/tests/test_language.py
+++ b/tests/test_language.py
@@ -58,6 +58,18 @@ def test_eds_tokenizer_whitespace():
     ]
 
 
+def test_eds_tokenizer_intraword_split():
+    nlp = spacy.blank("eds")
+    tokenized = [(w.text, w.whitespace_) for w in nlp("Un dia-\nbete ici")]
+    assert tokenized == [
+        ("Un", " "),
+        ("dia", ""),
+        ("-\n", ""),
+        ("bete", " "),
+        ("ici", ""),
+    ]
+
+
 def test_eds_tokenizer_numbers():
     nlp = spacy.blank("eds")
     tokenized = [(w.text, w.whitespace_) for w in nlp("Il fait 5.3/5.4mm")]
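For reference, a minimal sketch of the qualifier change above: `process` now accepts a `Span` and converts it to a standalone `Doc` via `ensure_doc`, so token offsets stay aligned. The pipeline and example text below are illustrative assumptions, not taken from this diff:

```python
import edsnlp
import edsnlp.pipes as eds

nlp = edsnlp.blank("eds")
nlp.add_pipe(eds.sentences())
negation = eds.negation()
nlp.add_pipe(negation)

doc = nlp("Le patient n'est pas diabétique.")
sentence = next(iter(doc.sents))  # a Span, not a Doc

# process() now calls ensure_doc() internally, turning the Span into
# its own Doc (Span.as_doc()) before matching, which avoids the
# alignment issues mentioned in the changelog
results = negation.process(sentence)
```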