Commit bb9e553
improve udpipe.Base online=1

- bugfix: `model` was ignored when tokenize=0
- allow more combinations, e.g. tokenize=0 tag=1 parse=0 or tokenize=0 tag=0 parse=1, where the existing tags/parses are reused
- for redoing the tokenization, you can use tokenize=1 delete_nodes=1, which first deletes the existing nodes and then creates them again using UDPipe's tokenizer (see the sketch below)

1 parent ed822b8 commit bb9e553
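For illustration, the new combinations can be driven from Python roughly like this. This is a hedged sketch, not part of the commit: the file names are placeholders and the model name is only an example of what the online service might accept.

```python
from udapi.core.document import Document
from udapi.block.udpipe.base import Base

doc = Document()
doc.load_conllu('input.conllu')  # placeholder: already tokenized, maybe tagged/parsed

# Previously unsupported combination: keep the existing tokenization and tags,
# fill in only the dependency parse.
parse_only = Base(online=True, model='english-ewt', tokenize=False, tag=False, parse=True)
parse_only.process_document(doc)

# Alternatively, redo the tokenization from scratch: delete_nodes=True removes
# the existing nodes first, then UDPipe's tokenizer creates them again.
retokenize = Base(online=True, model='english-ewt', tokenize=True, delete_nodes=True)
retokenize.process_document(doc)

doc.store_conllu('output.conllu')  # placeholder output path
```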

2 files changed: 32 additions, 14 deletions
udapi/block/udpipe/base.py (13 additions, 8 deletions)
```diff
@@ -120,12 +120,14 @@ class Base(Block):
 
     # pylint: disable=too-many-arguments
     def __init__(self, model=None, model_alias=None, online=False,
-                 tokenize=True, tag=True, parse=True, resegment=False, **kwargs):
+                 tokenize=True, tag=True, parse=True, resegment=False,
+                 delete_nodes=False, **kwargs):
         """Create the udpipe.En block object."""
         super().__init__(**kwargs)
         self.model, self.model_alias, self.online = model, model_alias, online
         self._tool = None
         self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment
+        self.delete_nodes = delete_nodes
 
     @property
     def tool(self):
@@ -146,16 +148,19 @@ def tool(self):
         return self._tool
 
     def process_document(self, doc):
-        tok, tag, par = self.tokenize, self.tag, self.parse
+        tok, tag, par, reseg = self.tokenize, self.tag, self.parse, self.resegment
         old_bundles = doc.bundles
         new_bundles = []
         for bundle in old_bundles:
             for tree in bundle:
                 new_bundles.append(bundle)
                 if self._should_process_tree(tree):
+                    if self.delete_nodes:
+                        for subroot in tree.children:
+                            subroot.remove()
                     if tok:
-                        new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=self.resegment,
-                                                                      tag=self.tag, parse=self.parse)
+                        new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=reseg,
+                                                                      tag=tag, parse=par)
                         if self.resegment and len(new_trees) > 1:
                             orig_bundle_id = bundle.bundle_id
                             bundle.bundle_id = orig_bundle_id + '-1'
@@ -164,9 +169,9 @@ def process_document(self, doc):
                                 new_tree.zone = tree.zone
                                 new_bundle.add_tree(new_tree)
                                 new_bundles.append(new_bundle)
-                    elif not tok and tag and par:
-                        self.tool.tag_parse_tree(tree)
-                    elif not tok and not tag and not par and self.resegment:
+                    elif not tok and not reseg and (tag or par):
+                        self.tool.tag_parse_tree(tree, tag=tag, parse=par)
+                    elif not tok and reseg and not tag and not par:
                         sentences = self.tool.segment_text(tree.text)
                         if len(sentences) > 1:
                             orig_bundle_id = bundle.bundle_id
@@ -178,7 +183,7 @@ def process_document(self, doc):
                                 new_tree.text = sentence
                                 new_bundles.append(new_bundle)
                     else:
-                        raise ValueError("Unimplemented tokenize=%s tag=%s parse=%s" % (tok, tag, par))
+                        raise ValueError(f"Unimplemented tokenize={tok} tag={tag} parse={par} resegment={reseg}")
         doc.bundles = new_bundles
 
 '''
```
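As a side note on the delete_nodes branch: a minimal sketch of its effect, mirroring the three added lines above (assuming `tree` is a sentence root):

```python
# Each child of the technical root heads a subtree, and Node.remove()
# deletes the node together with all of its descendants, so this loop
# empties the whole sentence.
for subroot in tree.children:
    subroot.remove()
# The stored sentence text is untouched, which is what UDPipe's tokenizer
# then re-tokenizes (per the commit message).
```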

udapi/tool/udpipeonline.py (19 additions, 6 deletions)
```diff
@@ -62,21 +62,34 @@ def perform_request(self, params, method="process"):
 
         return response["result"]
 
-    def tag_parse_tree(self, root):
+    def tag_parse_tree(self, root, tag=True, parse=True):
         """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
+        if not tag and not parse:
+            raise ValueError('tag_parse_tree(root, tag=False, parse=False) does not make sense.')
         descendants = root.descendants
         if not descendants:
             return
         in_data = " ".join([n.form for n in descendants])
-        out_data = self.perform_request(params={"data": in_data, "input":"horizontal", "tagger":"", "parser":""})
-        conllu_reader = ConlluReader()
+        params = {"model": self.model, "data": in_data, "input":"horizontal", "tagger":""}
+        if tag:
+            attrs = 'upos xpos lemma feats'.split()
+        else:
+            attrs = []
+        if parse:
+            params["parser"] = ""
+            attrs.append('deprel')
+
+        out_data = self.perform_request(params=params)
+        conllu_reader = ConlluReader(empty_parent="ignore")
         conllu_reader.files.filehandle = io.StringIO(out_data)
         parsed_root = conllu_reader.read_tree()
-        root.flatten()
+        if parse:
+            root.flatten()
         for parsed_node in parsed_root.descendants:
             node = descendants[parsed_node.ord - 1]
-            node.parent = descendants[parsed_node.parent.ord - 1] if parsed_node.parent.ord else root
-            for attr in 'upos xpos lemma feats deprel'.split():
+            if parse:
+                node.parent = descendants[parsed_node.parent.ord - 1] if parsed_node.parent.ord else root
+            for attr in attrs:
                 setattr(node, attr, getattr(parsed_node, attr))
 
     def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True):
```
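For completeness, the extended signature could also be called directly. In this sketch the UdpipeOnline class name and its constructor argument are assumptions inferred from the module path and the `self.model` reference above; the model and file names are illustrative.

```python
from udapi.core.document import Document
from udapi.tool.udpipeonline import UdpipeOnline  # class name assumed from the module path

doc = Document()
doc.load_conllu('tokenized.conllu')       # illustrative, already tokenized input
tool = UdpipeOnline(model='english-ewt')  # constructor argument assumed; model name illustrative

for bundle in doc.bundles:
    for tree in bundle:
        # tag=False keeps the existing lemmas, tags and feats; with parse=True
        # only parents and deprels are copied back (attrs == ['deprel'] above).
        tool.tag_parse_tree(tree, tag=False, parse=True)

doc.store_conllu('parsed.conllu')
```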
