Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/documentation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ jobs:

- name: Install dependencies
run: |
pip install '.[docs]'
pip install . --group docs
# uv venv
# uv pip install '.[docs]'
# uv pip install . --group docs

- name: Set up Git
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install '.[docs]'
pip install . --group docs

- name: Set up Git
run: |
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,16 +61,16 @@ jobs:
cache: 'pip'

- name: Install dependencies
run: pip install -e ".[dev]"
run: pip install -e . --group dev
if: matrix.python-version != '3.9' && matrix.python-version != '3.10' && matrix.python-version != '3.11' && matrix.python-version != '3.12'

- name: Install dependencies
run: pip install -e ".[dev,setup]"
run: pip install -e . --group dev --group setup
if: matrix.python-version == '3.9'

- name: Install dependencies
# skip ML tests for 3.10 and 3.11
run: pip install -e ".[dev-no-ml]"
run: pip install -e . --group dev-no-ml
if: matrix.python-version == '3.10' || matrix.python-version == '3.11' || matrix.python-version == '3.12'

- name: Test with Pytest on Python ${{ matrix.python-version }}
Expand Down Expand Up @@ -118,7 +118,7 @@ jobs:
cache: 'pip'

- name: Install dependencies
run: pip install -e ".[docs]"
run: pip install -e . --group docs

- name: Set up Git
run: |
Expand Down
29 changes: 0 additions & 29 deletions Makefile

This file was deleted.

4 changes: 4 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
- New `eds.explode` pipe that splits one document into multiple documents, one per span yielded by its `span_getter` parameter, each new document containing exactly that single span.
- New `Training a span classifier` tutorial, and reorganized deep-learning docs
- `ScheduledOptimizer` now warns when a parameter selector does not match any parameter.
- New trainable `eds.relation_detector_ffn` component to detect relations between entities. These relations are stored in each entity: `head._.rel[relation_label] = [tail1, tail2, ...]`.
- Load "Status" annotator notes as `status` dict attribute
- New `attention` pooling mode in `eds.relation_detector_ffn` (presumably — the original sentence is truncated; confirm the target component)
- Support different poolers for span embedding and inter-span embeddings in `eds.relation_detector_ffn`

## Fixed

Expand Down
4 changes: 2 additions & 2 deletions contributing.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ $ python -m venv venv
$ source venv/bin/activate

# Install the package with common, dev, setup dependencies in editable mode
$ pip install -e '.[dev,setup]'
$ pip install -e . --group dev --group setup
# And build resources
$ python scripts/conjugate_verbs.py
```
Expand Down Expand Up @@ -113,7 +113,7 @@ We use `MkDocs` for EDS-NLP's documentation. You can checkout the changes you ma

```console
# Install the requirements
$ pip install -e '.[docs]'
$ pip install -e . --group docs
---> 100%
color:green Installation successful

Expand Down
4 changes: 2 additions & 2 deletions docs/tutorials/training-ner.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ dependencies = [
"sentencepiece>=0.1.96"
]

[project.optional-dependencies]
[dependency-groups]
dev = [
"dvc>=2.37.0; python_version >= '3.8'",
"pandas>=1.1.0,<2.0.0; python_version < '3.8'",
Expand All @@ -59,7 +59,7 @@ pip install uv
# skip the next two lines if you do not want a venv
uv venv .venv
source .venv/bin/activate
uv pip install -e ".[dev]" -p $(uv python find)
uv pip install -e . --group dev -p $(uv python find)
```

## Training the model
Expand Down
4 changes: 2 additions & 2 deletions docs/tutorials/training-span-classifier.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ dependencies = [
"sentencepiece>=0.1.96"
]

[project.optional-dependencies]
[dependency-groups]
dev = [
"dvc>=2.37.0; python_version >= '3.8'",
"pandas>=1.4.0,<2.0.0; python_version >= '3.8'",
Expand All @@ -56,7 +56,7 @@ We recommend using a virtual environment and [uv](https://docs.astral.sh/uv/):
pip install uv
uv venv .venv
source .venv/bin/activate
uv pip install -e ".[dev]"
uv pip install -e . --group dev
```

## Creating the dataset
Expand Down
4 changes: 2 additions & 2 deletions docs/tutorials/tuning.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ dependencies = [
"configobj>=5.0.9",
]

[project.optional-dependencies]
[dependency-groups]
dev = [
"dvc>=2.37.0; python_version >= '3.8'",
"pandas>=1.1.0,<2.0.0; python_version < '3.8'",
Expand All @@ -61,7 +61,7 @@ pip install uv
# skip the next two lines if you do not want a venv
uv venv .venv
source .venv/bin/activate
uv pip install -e ".[dev]" -p $(uv python find)
uv pip install -e . --group dev -p $(uv python find)
```

## 2. Tuning a model
Expand Down
157 changes: 91 additions & 66 deletions edsnlp/data/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,76 +243,101 @@ def __init__(

def __call__(self, obj, tokenizer=None):
# tok = get_current_tokenizer() if self.tokenizer is None else self.tokenizer
tok = tokenizer or self.tokenizer or get_current_tokenizer()
doc = tok(obj["text"] or "")
doc._.note_id = obj.get("doc_id", obj.get(FILENAME))

spans = []

for dst in (
*(() if self.span_attributes is None else self.span_attributes.values()),
*self.default_attributes,
):
if not Span.has_extension(dst):
Span.set_extension(dst, default=None)

for ent in obj.get("entities") or ():
fragments = (
[
{
"begin": min(f["begin"] for f in ent["fragments"]),
"end": max(f["end"] for f in ent["fragments"]),
}
]
if not self.split_fragments
else ent["fragments"]
)
for fragment in fragments:
span = doc.char_span(
fragment["begin"],
fragment["end"],
label=ent["label"],
alignment_mode="expand",
)
attributes = (
{a["label"]: a["value"] for a in ent["attributes"]}
if isinstance(ent["attributes"], list)
else ent["attributes"]
note_id = obj.get("doc_id", obj.get(FILENAME))
try:
tok = tokenizer or self.tokenizer or get_current_tokenizer()
doc = tok(obj["text"] or "")
doc._.note_id = note_id

entities = {}
spans = []

for dst in (
*(
()
if self.span_attributes is None
else self.span_attributes.values()
),
*self.default_attributes,
):
if not Span.has_extension(dst):
Span.set_extension(dst, default=None)

for ent in obj.get("entities") or ():
fragments = (
[
{
"begin": min(f["begin"] for f in ent["fragments"]),
"end": max(f["end"] for f in ent["fragments"]),
}
]
if not self.split_fragments
else ent["fragments"]
)
if self.notes_as_span_attribute and ent["notes"]:
ent["attributes"][self.notes_as_span_attribute] = "|".join(
note["value"] for note in ent["notes"]
for fragment in fragments:
span = doc.char_span(
fragment["begin"],
fragment["end"],
label=ent["label"],
alignment_mode="expand",
)
for label, value in attributes.items():
new_name = (
self.span_attributes.get(label, None)
if self.span_attributes is not None
else label
attributes = (
{}
if "attributes" not in ent
else {a["label"]: a["value"] for a in ent["attributes"]}
if isinstance(ent["attributes"], list)
else ent["attributes"]
)
if self.span_attributes is None and not Span.has_extension(
new_name
):
Span.set_extension(new_name, default=None)

if new_name:
value = True if value is None else value
if not self.keep_raw_attribute_values:
value = (
True
if value in ("True", "true")
else False
if value in ("False", "false")
else value
)
span._.set(new_name, value)

spans.append(span)
if self.notes_as_span_attribute and ent["notes"]:
ent["attributes"][self.notes_as_span_attribute] = "|".join(
note["value"] for note in ent["notes"]
)
for label, value in attributes.items():
new_name = (
self.span_attributes.get(label, None)
if self.span_attributes is not None
else label
)
if self.span_attributes is None and not Span.has_extension(
new_name
):
Span.set_extension(new_name, default=None)

if new_name:
value = True if value is None else value
if not self.keep_raw_attribute_values:
value = (
True
if value in ("True", "true")
else False
if value in ("False", "false")
else value
)
span._.set(new_name, value)

entities.setdefault(ent["entity_id"], []).append(span)
spans.append(span)

set_spans(doc, spans, span_setter=self.span_setter)
for attr, value in self.default_attributes.items():
for span in spans:
if span._.get(attr) is None:
span._.set(attr, value)

for relation in obj.get("relations", []):
relation_label = (
relation["relation_label"]
if "relation_label" in relation
else relation["label"]
)
from_entity_id = relation["from_entity_id"]
to_entity_id = relation["to_entity_id"]

set_spans(doc, spans, span_setter=self.span_setter)
for attr, value in self.default_attributes.items():
for span in spans:
if span._.get(attr) is None:
span._.set(attr, value)
for head in entities.get(from_entity_id, ()):
for tail in entities.get(to_entity_id, ()):
head._.rel.setdefault(relation_label, set()).add(tail)
except Exception:
raise ValueError(f"Error when processing {note_id}")

return doc

Expand Down
34 changes: 21 additions & 13 deletions edsnlp/data/standoff.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
REGEX_ATTRIBUTE = re.compile(r"^([AM]\d+)\t(.+?) ([TE]\d+)(?: (.+))?$")
REGEX_EVENT = re.compile(r"^(E\d+)\t(.+)$")
REGEX_EVENT_PART = re.compile(r"(\S+):([TE]\d+)")
REGEX_STATUS = re.compile(r"^(#\d+)\tStatus ([^\t]+)\t(.*)$")


class BratParsingError(ValueError):
Expand Down Expand Up @@ -71,6 +72,7 @@ def parse_standoff_file(
entities = {}
relations = []
events = {}
doc = {}

with fs.open(txt_path, "r", encoding="utf-8") as f:
text = f.read()
Expand Down Expand Up @@ -178,6 +180,11 @@ def parse_standoff_file(
"arguments": arguments,
}
elif line.startswith("#"):
match = REGEX_STATUS.match(line)
if match:
comment = match.group(3)
doc["status"] = comment
continue
match = REGEX_NOTE.match(line)
if match is None:
raise BratParsingError(ann_file, line)
Expand All @@ -201,6 +208,7 @@ def parse_standoff_file(
"entities": list(entities.values()),
"relations": relations,
"events": list(events.values()),
**doc,
}


Expand Down Expand Up @@ -260,19 +268,19 @@ def dump_standoff_file(
)
attribute_idx += 1

# fmt: off
# if "relations" in doc:
# for i, relation in enumerate(doc["relations"]):
# entity_from = entities_ids[relation["from_entity_id"]]
# entity_to = entities_ids[relation["to_entity_id"]]
# print(
# "R{}\t{} Arg1:{} Arg2:{}\t".format(
# i + 1, str(relation["label"]), entity_from,
# entity_to
# ),
# file=f,
# )
# fmt: on
# fmt: off
if "relations" in doc:
for i, relation in enumerate(doc["relations"]):
entity_from = entities_ids[relation["from_entity_id"]]
entity_to = entities_ids[relation["to_entity_id"]]
print(
"R{}\t{} Arg1:{} Arg2:{}\t".format(
i + 1, str(relation["label"]), entity_from,
entity_to
),
file=f,
)
# fmt: on


class StandoffReader(FileBasedReader):
Expand Down
Loading
Loading