Skip to content

Commit 2929fb4

Browse files
committed
implement mwt.feats
fix #132
1 parent 30d59d4 commit 2929fb4

File tree

4 files changed

+30
-8
lines changed

4 files changed

+30
-8
lines changed

udapi/block/read/conllu.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,6 @@ def read_tree_from_lines(self, lines):
223223
logging.warning(f"Wrong MWT range in\n{fields[0]}\n\n{lines}")
224224
raise
225225
words = nodes[int(range_start):int(range_end) + 1]
226-
root.create_multiword_token(words, form=fields[1], misc=fields[-1])
226+
root.create_multiword_token(words, form=fields[1], feats=fields[5], misc=fields[9])
227227

228228
return root

udapi/block/write/conllu.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,9 @@ def process_tree(self, tree): # pylint: disable=too-many-branches
117117
if mwt and node._ord > last_mwt_id:
118118
print('\t'.join((mwt.ord_range,
119119
'_' if mwt.form is None else mwt.form,
120-
'_\t_\t_\t_\t_\t_\t_',
120+
'_\t_\t_',
121+
'_' if mwt._feats is None else str(mwt.feats),
122+
'_\t_\t_',
121123
'_' if mwt._misc is None else str(mwt.misc))))
122124
last_mwt_id = mwt.words[-1]._ord
123125

udapi/core/mwt.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,38 @@
11
"""MWT class represents a multi-word token."""
22
from udapi.core.dualdict import DualDict
3-
3+
from udapi.core.feats import Feats
44

55
class MWT(object):
66
"""Class for representing multi-word tokens in UD trees."""
7-
__slots__ = ['words', 'form', '_misc', 'root']
7+
__slots__ = ['words', 'form', '_feats', '_misc', 'root']
88

9-
def __init__(self, words=None, form=None, misc=None, root=None):
9+
def __init__(self, words=None, form=None, feats=None, misc=None, root=None):
1010
self.words = words if words is not None else []
1111
self.form = form
12+
self._feats = Feats(feats) if feats and feats != '_' else None
1213
self._misc = DualDict(misc) if misc and misc != '_' else None
1314
self.root = root
1415
for word in self.words:
1516
word._mwt = self # pylint: disable=W0212
1617

18+
@property
19+
def feats(self):
20+
"""Property `feats` in MWT should be used only for `Typo=Yes`.
21+
22+
See https://universaldependencies.org/changes.html#typos-in-multiword-tokens
23+
However, Udapi does not enforce this restriction and mwt.feats works exactly the same as node.feats.
24+
"""
25+
if self._feats is None:
26+
self._feats = Feats()
27+
return self._feats
28+
29+
@feats.setter
30+
def feats(self, value):
31+
if self._feats is None:
32+
self._feats = Feats(value)
33+
else:
34+
self._feats.set_mapping(value)
35+
1736
@property
1837
def misc(self):
1938
"""Property for MISC attributes stored as a `DualDict` object.

udapi/core/root.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,22 +167,23 @@ def create_empty_child(self, **kwargs):
167167
return new_node
168168

169169
# TODO document whether misc is a string or dict or it can be both
170-
def create_multiword_token(self, words=None, form=None, misc=None):
170+
def create_multiword_token(self, words=None, form=None, feats=None, misc=None):
171171
"""Create and return a new multi-word token (MWT) in this tree.
172172
173173
The new MWT can be optionally initialized using the following args.
174174
Args:
175175
words: a list of nodes which are part of the new MWT
176176
form: string representing the surface form of the new MWT
177-
misc: misc attribute of the new MWT
177+
feats: FEATS attribute of the new MWT (only `Typo=Yes` allowed there in UD guidelines)
178+
misc: MISC attribute of the new MWT
178179
"""
179180
# Nested or overlapping MWTs are not allowed in CoNLL-U,
180181
# so first remove all previous MWTs containing any of words.
181182
for w in words:
182183
if w.multiword_token:
183184
w.multiword_token.remove()
184185
# Now, create the new MWT.
185-
mwt = MWT(words, form, misc, root=self)
186+
mwt = MWT(words, form, feats, misc, root=self)
186187
self._mwts.append(mwt)
187188
if words[-1].misc["SpaceAfter"] == "No":
188189
mwt.misc["SpaceAfter"] = "No"

0 commit comments

Comments
 (0)