corefud.Link2Cluster prepared to convert PROIEL files

martinpopel · martinpopel · commit 3f9dd8417c7b · 2024-02-01T02:01:36.000+01:00
diff --git a/udapi/block/corefud/link2cluster.py b/udapi/block/corefud/link2cluster.py
@@ -1,12 +1,14 @@
+import logging
 from udapi.core.block import Block
 
 class Link2Cluster(Block):
     """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format."""
 
-    def __init__(self, id_attr='external-id', ante_attr='antecedent-id', **kwargs):
+    def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, **kwargs):
         super().__init__(**kwargs)
         self.id_attr = id_attr
-        self.ante_id = ante_attr
+        self.ante_attr = ante_attr
+        self.delete_orig_attrs = delete_orig_attrs
 
     def process_document(self, doc):
         id2node = {}
@@ -18,20 +20,38 @@ def process_document(self, doc):
                 ante_id = node.misc[self.ante_attr]
                 if ante_id != '':
                     links.append([ante_id, this_id])
+                if self.delete_orig_attrs:
+                    for attr in (self.id_attr, self.ante_attr):
+                        del node.misc[attr]
+
+        for link in links:
+            if link[0] not in id2node:
+                logging.warning(f"{link[0]} is referenced in {self.ante_attr}, but not in {self.id_attr}")               
+        links = [link for link in links if link[0] in id2node]        
+
+        # nodeA < nodeB is a shortcut for nodeA.ord < nodeB.ord
+        # but here we need to sort nodes from different sentences,
+        # so we need to compare first the bundle number and then node.ord.
+        sort_key = lambda node: (node.root.bundle.number, node.ord)
 
         # sorted(...,reverse=True) converts both cataphora and anaphora to a pair (this, ante) where ante < this.
-        node_links = [sorted([id2node[link[0]], id2node[link[1]]], reverse=True) for link in links]
+        node_links = [sorted([id2node[link[0]], id2node[link[1]]], reverse=True, key=sort_key) for link in links]
         
-        # sort() makes sure the links are sorted by the "this" node (i.e. the anaphor, not the antecendent).
-        node_links.sort()
+        # Makes sure the links are sorted by this_node (i.e. the anaphor, not the antecedent).
+        node_links.sort(key=lambda link: sort_key(link[0]))
         
         # Thanks to this sorting, we can assert that this_node is not part of any mention/entity when iterating
         # and we can prevent the need for merging two entities.
         for this_node, ante_node in node_links:
-            assert not this_node.mentions
-            if ante_node.mentions:
-                ante_node.entities[0].create_mention(head=this_node, words=[this_node])
+            assert not this_node.coref_mentions
+            if ante_node.coref_mentions:
+                ante_node.coref_entities[0].create_mention(head=this_node, words=[this_node])
             else:
                 entity = this_node.root.document.create_coref_entity()
-                entity.create_mention(head=ante_node, words=[ante_node])
-                entity.create_mention(head=this_node, words=[this_node])
+                m_ante = entity.create_mention(head=ante_node, words=[ante_node])
+                m_this = entity.create_mention(head=this_node, words=[this_node])
+                for node, mention in ((ante_node, m_ante), (this_node, m_this)):
+                    if node.misc['information-status']:
+                        mention.other['infstat'] = node.misc['information-status']
+                        if self.delete_orig_attrs:
+                            del node.misc['information-status']
diff --git a/udapi/core/document.py b/udapi/core/document.py
@@ -159,9 +159,9 @@ def create_coref_entity(self, eid=None, etype=None):
         self._load_coref()
         if not eid:
             counter = 1
-            while self._eid_to_entity.get(f'c{counter}'):
+            while self._eid_to_entity.get(f'e{counter}'):
                 counter += 1
-            eid = f'c{counter}'
+            eid = f'e{counter}'
         elif self._eid_to_entity.get(eid):
             raise ValueError("Entity with eid=%s already exists", eid)
         entity = udapi.core.coref.CorefEntity(eid, etype)