1
+ import logging
1
2
from udapi .core .block import Block
2
3
3
4
class Link2Cluster (Block ):
4
5
"""Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format."""
5
6
6
def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id',
             delete_orig_attrs=True, **kwargs):
    """Configure which MISC attributes carry the link-based annotation.

    Args:
        id_attr: name of the `node.misc` attribute holding the node's own
            identifier (the value other nodes refer to as their antecedent).
        ante_attr: name of the `node.misc` attribute holding the identifier
            of the node's antecedent; an empty value means "no antecedent".
        delete_orig_attrs: if true, `process_document` deletes `id_attr` and
            `ante_attr` from `node.misc` once they have been read, and also
            deletes `information-status` after copying it to the mention.
        **kwargs: forwarded unchanged to the `Block` constructor.
    """
    super().__init__(**kwargs)
    self.id_attr = id_attr
    # Stored as `ante_attr` because process_document reads self.ante_attr.
    self.ante_attr = ante_attr
    self.delete_orig_attrs = delete_orig_attrs
10
12
11
13
def process_document (self , doc ):
12
14
id2node = {}
@@ -18,20 +20,38 @@ def process_document(self, doc):
18
20
ante_id = node .misc [self .ante_attr ]
19
21
if ante_id != '' :
20
22
links .append ([ante_id , this_id ])
23
+ if self .delete_orig_attrs :
24
+ for attr in (self .id_attr , self .ante_attr ):
25
+ del node .misc [attr ]
26
+
27
+ for link in links :
28
+ if link [0 ] not in id2node :
29
+ logging .warning (f"{ link [0 ]} is referenced in { self .ante_attr } , but not in { self .id_attr } " )
30
+ links = [link for link in links if link [0 ] in id2node ]
31
+
32
+ # nodeA < nodeB is a shortcut for nodeA.ord < nodeB.ord
33
+ # but here we need to sort nodes from different sentences,
34
+ # so we need to compare first the bundle number and then node.ord.
35
+ sort_key = lambda node : (node .root .bundle .number , node .ord )
21
36
22
37
# sorted(...,reverse=True) converts both cataphora and anaphora to a pair (this, ante) where ante < this.
23
- node_links = [sorted ([id2node [link [0 ]], id2node [link [1 ]]], reverse = True ) for link in links ]
38
+ node_links = [sorted ([id2node [link [0 ]], id2node [link [1 ]]], reverse = True , key = sort_key ) for link in links ]
24
39
25
- # sort() makes sure the links are sorted by the "this" node (i.e. the anaphor, not the antecendent).
26
- node_links .sort ()
40
+ # Make sure the links are sorted by this_node (i.e. the anaphor, not the antecedent).
41
+ node_links .sort (key = lambda link : sort_key ( link [ 0 ]) )
27
42
28
43
# Thanks to this sorting, we can assert that this_node is not part of any mention/entity when iterating
29
44
# and we can prevent the need for merging two entities.
30
45
for this_node , ante_node in node_links :
31
- assert not this_node .mentions
32
- if ante_node .mentions :
33
- ante_node .entities [0 ].create_mention (head = this_node , words = [this_node ])
46
+ assert not this_node .coref_mentions
47
+ if ante_node .coref_mentions :
48
+ ante_node .coref_entities [0 ].create_mention (head = this_node , words = [this_node ])
34
49
else :
35
50
entity = this_node .root .document .create_coref_entity ()
36
- entity .create_mention (head = ante_node , words = [ante_node ])
37
- entity .create_mention (head = this_node , words = [this_node ])
51
+ m_ante = entity .create_mention (head = ante_node , words = [ante_node ])
52
+ m_this = entity .create_mention (head = this_node , words = [this_node ])
53
+ for node , mention in ((ante_node , m_ante ), (this_node , m_this )):
54
+ if node .misc ['information-status' ]:
55
+ mention .other ['infstat' ] = node .misc ['information-status' ]
56
+ if self .delete_orig_attrs :
57
+ del node .misc ['information-status' ]
0 commit comments