Merge pull request #3052 from flairNLP/gh-3034/fix-misaligned_spans

Fix label alignment when the sentence contains invalid tokens (tokens that are not added to the sentence, e.g. a whitespace-only token).
flairNLP · Jan 23, 2023 · c777f45
2 parents 5829c54 + c4235c7
commit c777f45
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 2 deletions.
diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
@@ -647,7 +647,6 @@ def _convert_lines_to_sentence(
             # otherwise, this line is a token. parse and add to sentence
             token = self._parse_token(line, word_level_tag_columns, token)
             tokens.append(token)
-
         sentence: Sentence = Sentence(text=tokens)
 
         # check if this sentence is a document boundary
@@ -661,6 +660,9 @@ def _convert_lines_to_sentence(
                     bioes_tags = [
                         re.split(self.column_delimiter, line.rstrip())[span_column] for line in filtered_lines
                     ]
+
+                    # discard tags from tokens that are not added to the sentence
+                    bioes_tags = [tag for tag, token in zip(bioes_tags, tokens) if token._internal_index is not None]
                     predicted_spans = get_spans_from_bio(bioes_tags)
                     for span_indices, score, label in predicted_spans:
                         span = sentence[span_indices[0] : span_indices[-1] + 1]

diff --git a/tests/test_corpus_dictionary.py b/tests/test_corpus_dictionary.py
@@ -4,7 +4,7 @@
 
 import flair
 from flair.data import Corpus, Dictionary, Label, Sentence
-from flair.datasets import FlairDatapointDataset, SentenceDataset
+from flair.datasets import ColumnCorpus, FlairDatapointDataset, SentenceDataset
 
 
 def test_dictionary_get_items_with_unk():
@@ -288,3 +288,29 @@ def test_classification_corpus_multi_labels_with_negative_examples(tasks_base_pa
     assert len(corpus.train) == 8
     assert len(corpus.dev) == 5
     assert len(corpus.test) == 6
+
+
+def test_misalignment_spans(tasks_base_path):
+    example_txt = """George B-NAME
+Washington I-NAME
+went O
+\t O
+Washington B-CITY
+and O
+enjoyed O
+some O
+coffee B-BEVERAGE
+"""
+    train_path = tasks_base_path / "tmp" / "train.txt"
+    try:
+        train_path.parent.mkdir(exist_ok=True, parents=True)
+        train_path.write_text(example_txt, encoding="utf-8")
+        corpus = ColumnCorpus(
+            data_folder=train_path.parent, column_format={0: "text", 1: "ner"}, train_file=train_path.name
+        )
+        sentence = corpus.train[0]
+        span_texts = [span.text for span in sentence.get_spans("ner")]
+        assert span_texts == ["George Washington", "Washington", "coffee"]
+    finally:
+        train_path.unlink()
+        train_path.parent.rmdir()