Skip to content

Commit

Permalink
fix label alignment if the sentence contains invalid tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
Benedikt Fuchs authored and helpmefindaname committed Jan 19, 2023
1 parent 1577601 commit 5c6dc99
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 2 deletions.
2 changes: 1 addition & 1 deletion flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,6 @@ def _convert_lines_to_sentence(
# otherwise, this line is a token. parse and add to sentence
token = self._parse_token(line, word_level_tag_columns, token)
tokens.append(token)

sentence: Sentence = Sentence(text=tokens)

# check if this sentence is a document boundary
Expand All @@ -661,6 +660,7 @@ def _convert_lines_to_sentence(
bioes_tags = [
re.split(self.column_delimiter, line.rstrip())[span_column] for line in filtered_lines
]
bioes_tags = [tag for tag, token in zip(bioes_tags, tokens) if token._internal_index is not None]
predicted_spans = get_spans_from_bio(bioes_tags)
for span_indices, score, label in predicted_spans:
span = sentence[span_indices[0] : span_indices[-1] + 1]
Expand Down
28 changes: 27 additions & 1 deletion tests/test_corpus_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import flair
from flair.data import Corpus, Dictionary, Label, Sentence
from flair.datasets import FlairDatapointDataset, SentenceDataset
from flair.datasets import FlairDatapointDataset, SentenceDataset, ColumnCorpus


def test_dictionary_get_items_with_unk():
Expand Down Expand Up @@ -288,3 +288,29 @@ def test_classification_corpus_multi_labels_with_negative_examples(tasks_base_pa
assert len(corpus.train) == 8
assert len(corpus.dev) == 5
assert len(corpus.test) == 6


def test_misalignment_spans(tasks_base_path):
    """Check that NER spans stay aligned when a column file holds an odd token line.

    A small two-column (text, ner) file is written to a scratch folder below
    ``tasks_base_path``, loaded through ``ColumnCorpus``, and the texts of the
    spans found on the first training sentence are compared with the expected
    three entities. The scratch file and folder are removed afterwards.
    """
    # The fourth line carries a tab escape in the text column; presumably this
    # is the "invalid token" the surrounding commit refers to.
    column_data = """George B-NAME
Washington I-NAME
went O
\t O
Washington B-CITY
and O
enjoyed O
some O
coffee B-BEVERAGE
"""
    scratch_dir = tasks_base_path / "tmp"
    train_path = scratch_dir / "train.txt"
    try:
        scratch_dir.mkdir(parents=True, exist_ok=True)
        train_path.write_text(column_data, encoding="utf-8")

        corpus = ColumnCorpus(
            data_folder=scratch_dir,
            column_format={0: "text", 1: "ner"},
            train_file=train_path.name,
        )

        first_sentence = corpus.train[0]
        found_texts = []
        for span in first_sentence.get_spans("ner"):
            found_texts.append(span.text)

        assert found_texts == ["George Washington", "Washington", "coffee"]
    finally:
        # Always clean up the scratch file and its folder, even on failure.
        train_path.unlink(missing_ok=True)
        scratch_dir.rmdir()

0 comments on commit 5c6dc99

Please sign in to comment.