Skip to content

Commit

Permalink
Merge pull request #3052 from flairNLP/gh-3034/fix-misaligned_spans
Browse files — browse the repository at this point in the history
fix label alignment if the sentence contains invalid tokens
  • Loading branch information
alanakbik authored Jan 23, 2023
2 parents 5829c54 + c4235c7 commit c777f45
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 2 deletions.
4 changes: 3 additions & 1 deletion flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,6 @@ def _convert_lines_to_sentence(
# otherwise, this line is a token. parse and add to sentence
token = self._parse_token(line, word_level_tag_columns, token)
tokens.append(token)

sentence: Sentence = Sentence(text=tokens)

# check if this sentence is a document boundary
Expand All @@ -661,6 +660,9 @@ def _convert_lines_to_sentence(
bioes_tags = [
re.split(self.column_delimiter, line.rstrip())[span_column] for line in filtered_lines
]

# discard tags from tokens that are not added to the sentence
bioes_tags = [tag for tag, token in zip(bioes_tags, tokens) if token._internal_index is not None]
predicted_spans = get_spans_from_bio(bioes_tags)
for span_indices, score, label in predicted_spans:
span = sentence[span_indices[0] : span_indices[-1] + 1]
Expand Down
28 changes: 27 additions & 1 deletion tests/test_corpus_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import flair
from flair.data import Corpus, Dictionary, Label, Sentence
from flair.datasets import FlairDatapointDataset, SentenceDataset
from flair.datasets import ColumnCorpus, FlairDatapointDataset, SentenceDataset


def test_dictionary_get_items_with_unk():
Expand Down Expand Up @@ -288,3 +288,29 @@ def test_classification_corpus_multi_labels_with_negative_examples(tasks_base_pa
assert len(corpus.train) == 8
assert len(corpus.dev) == 5
assert len(corpus.test) == 6


def test_misalignment_spans(tasks_base_path):
    """Check that NER spans stay aligned when one row's text is only a tab.

    A small two-column (text, ner) file is written to a scratch directory under
    ``tasks_base_path``, loaded through ``ColumnCorpus``, and the texts of the
    spans found on the first training sentence are compared with the expected
    surface strings. The scratch file and directory are removed afterwards.
    """
    column_data = """George B-NAME
Washington I-NAME
went O
\t O
Washington B-CITY
and O
enjoyed O
some O
coffee B-BEVERAGE
"""
    expected_spans = ["George Washington", "Washington", "coffee"]
    scratch_dir = tasks_base_path / "tmp"
    train_path = scratch_dir / "train.txt"
    try:
        scratch_dir.mkdir(exist_ok=True, parents=True)
        train_path.write_text(column_data, encoding="utf-8")
        corpus = ColumnCorpus(
            data_folder=scratch_dir,
            column_format={0: "text", 1: "ner"},
            train_file=train_path.name,
        )
        first_sentence = corpus.train[0]
        found_spans = []
        for span in first_sentence.get_spans("ner"):
            found_spans.append(span.text)
        assert found_spans == expected_spans
    finally:
        # Clean up the scratch file and its directory whether or not the check passed.
        train_path.unlink()
        scratch_dir.rmdir()

0 comments on commit c777f45

Please sign in to comment.