Skip to content

Commit 3f9dd84

Browse files
committed
corefud.Link2Cluster prepared to convert PROIEL files
1 parent a5f5d90 commit 3f9dd84

File tree

2 files changed

+32
-12
lines changed

2 files changed

+32
-12
lines changed

udapi/block/corefud/link2cluster.py

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
1+
import logging
12
from udapi.core.block import Block
23

34
class Link2Cluster(Block):
45
"""Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format."""
56

6-
def __init__(self, id_attr='external-id', ante_attr='antecedent-id', **kwargs):
7+
def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, **kwargs):
78
super().__init__(**kwargs)
89
self.id_attr = id_attr
9-
self.ante_id = ante_attr
10+
self.ante_attr = ante_attr
11+
self.delete_orig_attrs = delete_orig_attrs
1012

1113
def process_document(self, doc):
1214
id2node = {}
@@ -18,20 +20,38 @@ def process_document(self, doc):
1820
ante_id = node.misc[self.ante_attr]
1921
if ante_id != '':
2022
links.append([ante_id, this_id])
23+
if self.delete_orig_attrs:
24+
for attr in (self.id_attr, self.ante_attr):
25+
del node.misc[attr]
26+
27+
for link in links:
28+
if link[0] not in id2node:
29+
logging.warning(f"{link[0]} is referenced in {self.ante_attr}, but not in {self.id_attr}")
30+
links = [link for link in links if link[0] in id2node]
31+
32+
# nodeA < nodeB is a shortcut for nodeA.ord < nodeB.ord
33+
# but here we need to sort nodes from different sentences,
34+
# so we need to compare first the bundle number and then node.ord.
35+
sort_key = lambda node: (node.root.bundle.number, node.ord)
2136

2237
# sorted(...,reverse=True) converts both cataphora and anaphora to a pair (this, ante) where ante < this.
23-
node_links = [sorted([id2node[link[0]], id2node[link[1]]], reverse=True) for link in links]
38+
node_links = [sorted([id2node[link[0]], id2node[link[1]]], reverse=True, key=sort_key) for link in links]
2439

25-
# sort() makes sure the links are sorted by the "this" node (i.e. the anaphor, not the antecedent).
26-
node_links.sort()
40+
# Makes sure the links are sorted by this_node (i.e. the anaphor, not the antecedent).
41+
node_links.sort(key=lambda link: sort_key(link[0]))
2742

2843
# Thanks to this sorting, we can assert that this_node is not part of any mention/entity when iterating
2944
# and we can prevent the need for merging two entities.
3045
for this_node, ante_node in node_links:
31-
assert not this_node.mentions
32-
if ante_node.mentions:
33-
ante_node.entities[0].create_mention(head=this_node, words=[this_node])
46+
assert not this_node.coref_mentions
47+
if ante_node.coref_mentions:
48+
ante_node.coref_entities[0].create_mention(head=this_node, words=[this_node])
3449
else:
3550
entity = this_node.root.document.create_coref_entity()
36-
entity.create_mention(head=ante_node, words=[ante_node])
37-
entity.create_mention(head=this_node, words=[this_node])
51+
m_ante = entity.create_mention(head=ante_node, words=[ante_node])
52+
m_this = entity.create_mention(head=this_node, words=[this_node])
53+
for node, mention in ((ante_node, m_ante), (this_node, m_this)):
54+
if node.misc['information-status']:
55+
mention.other['infstat'] = node.misc['information-status']
56+
if self.delete_orig_attrs:
57+
del node.misc['information-status']

udapi/core/document.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,9 +159,9 @@ def create_coref_entity(self, eid=None, etype=None):
159159
self._load_coref()
160160
if not eid:
161161
counter = 1
162-
while self._eid_to_entity.get(f'c{counter}'):
162+
while self._eid_to_entity.get(f'e{counter}'):
163163
counter += 1
164-
eid = f'c{counter}'
164+
eid = f'e{counter}'
165165
elif self._eid_to_entity.get(eid):
166166
raise ValueError("Entity with eid=%s already exists", eid)
167167
entity = udapi.core.coref.CorefEntity(eid, etype)

0 commit comments

Comments
 (0)