diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 0bfb8e4bd..b1b89d32a 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -647,7 +647,6 @@ def _convert_lines_to_sentence( # otherwise, this line is a token. parse and add to sentence token = self._parse_token(line, word_level_tag_columns, token) tokens.append(token) - sentence: Sentence = Sentence(text=tokens) # check if this sentence is a document boundary @@ -661,6 +660,9 @@ def _convert_lines_to_sentence( bioes_tags = [ re.split(self.column_delimiter, line.rstrip())[span_column] for line in filtered_lines ] + + # discard tags from tokens that are not added to the sentence + bioes_tags = [tag for tag, token in zip(bioes_tags, tokens) if token._internal_index is not None] predicted_spans = get_spans_from_bio(bioes_tags) for span_indices, score, label in predicted_spans: span = sentence[span_indices[0] : span_indices[-1] + 1] diff --git a/tests/test_corpus_dictionary.py b/tests/test_corpus_dictionary.py index 2addfaa39..aee080fe9 100644 --- a/tests/test_corpus_dictionary.py +++ b/tests/test_corpus_dictionary.py @@ -4,7 +4,7 @@ import flair from flair.data import Corpus, Dictionary, Label, Sentence -from flair.datasets import FlairDatapointDataset, SentenceDataset +from flair.datasets import ColumnCorpus, FlairDatapointDataset, SentenceDataset def test_dictionary_get_items_with_unk(): @@ -288,3 +288,29 @@ def test_classification_corpus_multi_labels_with_negative_examples(tasks_base_pa assert len(corpus.train) == 8 assert len(corpus.dev) == 5 assert len(corpus.test) == 6 + + +def test_misalignment_spans(tasks_base_path): + example_txt = """George B-NAME +Washington I-NAME +went O +\t O +Washington B-CITY +and O +enjoyed O +some O +coffee B-BEVERAGE +""" + train_path = tasks_base_path / "tmp" / "train.txt" + try: + train_path.parent.mkdir(exist_ok=True, parents=True) + train_path.write_text(example_txt, encoding="utf-8") + corpus = ColumnCorpus( + data_folder=train_path.parent, column_format={0: "text", 1: "ner"}, train_file=train_path.name + ) + sentence = corpus.train[0] + span_texts = [span.text for span in sentence.get_spans("ner")] + assert span_texts == ["George Washington", "Washington", "coffee"] + finally: + train_path.unlink() + train_path.parent.rmdir()