Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for v2.0 of HIPE-2022 data #2684

Merged
merged 10 commits into from
Mar 25, 2022
23 changes: 18 additions & 5 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -4136,7 +4136,8 @@ def __init__(
base_path: Union[str, Path] = None,
tag_to_bioes: str = "ner",
in_memory: bool = True,
version: str = "v1.0",
version: str = "v2.0",
branch_name: str = "main",
dev_split_name="dev",
add_document_separator=False,
sample_missing_splits=False,
Expand All @@ -4152,6 +4153,7 @@ def __init__(
:tag_to_bioes: Dataset will automatically transformed into BIOES format (internally).
:in_memory: If True, keeps dataset in memory giving speedups in training.
:version: Version of CLEF-HIPE dataset. Currently v1.0 and v2.0 are supported and available.
:branch_name: Defines git branch name of HIPE data repository (main by default).
:dev_split_name: Defines default name of development split (dev by default). Only the NewsEye dataset has
currently two development splits: dev and dev2.
:add_document_separator: If True, a special document separator will be introduced. This is highly
Expand Down Expand Up @@ -4180,19 +4182,25 @@ def __init__(
}
}

# v2.0 only adds new language and splits for AJMC dataset
hipe_available_splits["v2.0"] = hipe_available_splits.get("v1.0").copy()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@stefan-it mypy complains due to this line, causing the unit tests to fail. The 'error' is printed at the end of the test output:

mypy exited with status 1.
_____________________ flair/datasets/sequence_labeling.py ______________________
4186: error: Item "None" of "Optional[Dict[str, Dict[str, List[str]]]]" has no attribute "copy"
===================================== mypy =====================================

It seems there is some problem with the copy() here. Perhaps it can be removed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, thanks! copy is necessary here, because otherwise v1.0 values would be changed, when changing v2.0 variables. The .get() method returns an optional causing mypy to fail here, so I used the normal index access now. CI is green then 🤗

hipe_available_splits["v2.0"]["ajmc"] = {"de": ["train", "dev"], "en": ["train", "dev"], "fr": ["train", "dev"]}

eos_marker = "EndOfSentence"
document_separator = "# hipe2022:document_id"

# Special document marker for sample splits in AJMC dataset
if f"{version}/{dataset_name}" == "v1.0/ajmc":
if f"{dataset_name}" == "ajmc":
document_separator = "# hipe2022:original_source"

columns = {0: "text", 1: "ner"}

dataset_base = self.__class__.__name__.lower()
data_folder = base_path / dataset_base / dataset_name / language
data_folder = base_path / dataset_base / version / dataset_name / language

data_url = f"https://github.com/hipe-eval/HIPE-2022-data/raw/main/data/{version}/{dataset_name}/{language}"
data_url = (
f"https://github.com/hipe-eval/HIPE-2022-data/raw/{branch_name}/data/{version}/{dataset_name}/{language}"
)

dataset_splits = hipe_available_splits[version][dataset_name][language]

Expand Down Expand Up @@ -4234,6 +4242,7 @@ def __init__(
in_memory=in_memory,
document_separator_token="-DOCSTART-",
skip_first_line=True,
column_delimiter="\t",
comment_symbol="# ",
sample_missing_splits=sample_missing_splits,
**corpusargs,
Expand All @@ -4251,7 +4260,11 @@ def __prepare_corpus(
f_out.write(lines[0] + "\n")

for line in lines[1:]:
line = line.rstrip()
if line.startswith(" \t"):
# Workaround for empty tokens
continue

line = line.strip()

# Add "real" document marker
if add_document_separator and line.startswith(document_separator):
Expand Down
154 changes: 129 additions & 25 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,44 +373,119 @@ def test_hipe_2022_corpus(tasks_base_path):
# We have manually checked, that these numbers are correct:
hipe_stats = {
"v1.0": {
"ajmc": {"de": {"sample": {"sents": 119, "docs": 8}}, "en": {"sample": {"sents": 83, "docs": 5}}},
"ajmc": {
"de": {"sample": {"sents": 119, "docs": 8, "labels": ["date", "loc", "pers", "scope", "work"]}},
"en": {"sample": {"sents": 83, "docs": 5, "labels": ["date", "loc", "pers", "scope", "work"]}},
},
"hipe2020": {
"de": {
"train": {"sents": 3470 + 2, "docs": 103}, # 2 sentences with missing EOS marker
"dev": {
"sents": 1202,
"docs": 33,
"train": {
"sents": 3470 + 2, # 2 sentences with missing EOS marker
"docs": 103,
"labels": ["loc", "org", "pers", "prod", "time"],
},
"dev": {"sents": 1202, "docs": 33, "labels": ["loc", "org", "pers", "prod", "time"]},
},
"en": {"dev": {"sents": 1045, "docs": 80, "labels": ["loc", "org", "pers", "prod", "time"]}},
"fr": {
"train": {"sents": 5743, "docs": 158, "labels": ["loc", "org", "pers", "prod", "time", "comp"]},
"dev": {"sents": 1244, "docs": 43, "labels": ["loc", "org", "pers", "prod", "time"]},
},
"en": {"dev": {"sents": 1045, "docs": 80}},
"fr": {"train": {"sents": 5743, "docs": 158}, "dev": {"sents": 1244, "docs": 43}},
},
"letemps": {"fr": {"train": {"sents": 14051, "docs": 414}, "dev": {"sents": 1341, "docs": 51}}},
"letemps": {
"fr": {
"train": {"sents": 14051, "docs": 414, "labels": ["loc", "org", "pers"]},
"dev": {"sents": 1341, "docs": 51, "labels": ["loc", "org", "pers"]},
}
},
"newseye": {
# +1 offset, because of missing EOS marker at EOD
"de": {
"train": {"sents": 23646 + 1, "docs": 11},
"dev": {"sents": 1110 + 1, "docs": 12},
"dev2": {"sents": 1541 + 1, "docs": 12},
"train": {"sents": 23646 + 1, "docs": 11, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
"dev": {"sents": 1110 + 1, "docs": 12, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
"dev2": {"sents": 1541 + 1, "docs": 12, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
},
"fi": {
"train": {"sents": 1141 + 1, "docs": 24},
"dev": {"sents": 140 + 1, "docs": 24},
"dev2": {"sents": 104 + 1, "docs": 21},
"train": {"sents": 1141 + 1, "docs": 24, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
"dev": {"sents": 140 + 1, "docs": 24, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
"dev2": {"sents": 104 + 1, "docs": 21, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
},
"fr": {
"train": {"sents": 7106 + 1, "docs": 35},
"dev": {"sents": 662 + 1, "docs": 35},
"dev2": {"sents": 1016 + 1, "docs": 35},
"train": {"sents": 7106 + 1, "docs": 35, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
"dev": {"sents": 662 + 1, "docs": 35, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
"dev2": {"sents": 1016 + 1, "docs": 35, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
},
"sv": {
"train": {"sents": 1063 + 1, "docs": 21},
"dev": {"sents": 126 + 1, "docs": 21},
"dev2": {"sents": 136 + 1, "docs": 21},
"train": {"sents": 1063 + 1, "docs": 21, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
"dev": {"sents": 126 + 1, "docs": 21, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
"dev2": {"sents": 136 + 1, "docs": 21, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
},
},
"sonar": {"de": {"dev": {"sents": 1603 + 10, "docs": 10}}}, # 10 sentences with missing EOS marker
"topres19th": {"en": {"train": {"sents": 5874, "docs": 309}, "dev": {"sents": 646, "docs": 34}}},
"sonar": {
"de": {
"dev": {
"sents": 1603 + 10, # 10 sentences with missing EOS marker
"docs": 10,
"labels": ["LOC", "ORG", "PER"],
}
}
},
"topres19th": {
"en": {
"train": {"sents": 5874, "docs": 309, "labels": ["BUILDING", "LOC", "STREET"]},
"dev": {"sents": 646, "docs": 34, "labels": ["BUILDING", "LOC", "STREET"]},
}
},
}
}

hipe_stats["v2.0"] = hipe_stats.get("v1.0").copy()
hipe_stats["v2.0"]["ajmc"] = {
"de": {
"train": {
"sents": 1022 + 2, # 2 sentences with missing EOS marker
"docs": 76,
"labels": ["date", "loc", "object", "pers", "scope", "work"],
},
"dev": {"sents": 192, "docs": 14, "labels": ["loc", "object", "pers", "scope", "work"]},
},
"en": {
"train": {
"sents": 1153 + 1, # 1 sentence with missing EOS marker
"docs": 60,
"labels": ["date", "loc", "object", "pers", "scope", "work"],
},
"dev": {
"sents": 251 + 1, # 1 sentence with missing EOS marker
"docs": 14,
"labels": ["date", "loc", "pers", "scope", "work"],
},
},
"fr": {
"train": {
"sents": 893 + 1, # 1 sentence with missing EOS marker
"docs": 72,
"labels": ["date", "loc", "object", "pers", "scope", "work"],
},
"dev": {
"sents": 201 + 1, # 1 sentence with missing EOS marker
"docs": 17,
"labels": ["pers", "scope", "work"],
},
},
}
hipe_stats["v2.0"]["newseye"] = {
"de": {
"train": {"sents": 20839 + 1, "docs": 7, "labels": ["HumanProd", "LOC", "ORG", "PER"]} # missing EOD marker
}
}
hipe_stats["v2.0"]["sonar"] = {
"de": {
"dev": {
"sents": 816 + 10, # 9 sentences with missing EOS marker + missing EOD
"docs": 10,
"labels": ["LOC", "ORG", "PER"],
}
}
}

Expand All @@ -420,6 +495,7 @@ def test_hipe_2022(dataset_version="v1.0", add_document_separator=True):
splits = languages[language]

corpus = flair.datasets.NER_HIPE_2022(
version=dataset_version,
dataset_name=dataset_name,
language=language,
dev_split_name="dev",
Expand All @@ -429,30 +505,58 @@ def test_hipe_2022(dataset_version="v1.0", add_document_separator=True):
for split_name, stats in splits.items():
split_description = f"{dataset_name}/{language}@{split_name}"

total_sentences = sum(stats.values()) if add_document_separator else stats["sents"]
current_sents = stats["sents"]
current_docs = stats["docs"]
current_labels = set(stats["labels"] + ["<unk>"])

total_sentences = current_sents + current_docs if add_document_separator else stats["sents"]

if split_name == "train":
assert (
len(corpus.train) == total_sentences
), f"Sentence count mismatch for {split_description}: {len(corpus.train)} vs. {total_sentences}"

gold_labels = set(corpus.make_label_dictionary(label_type="ner").get_items())

assert (
current_labels == gold_labels
), f"Label mismatch for {split_description}: {current_labels} vs. {gold_labels}"

elif split_name in ["dev", "sample"]:
assert (
len(corpus.dev) == total_sentences
), f"Sentence count mismatch for {split_description}: {len(corpus.dev)} vs. {total_sentences}"

corpus._train = corpus._dev
gold_labels = set(corpus.make_label_dictionary(label_type="ner").get_items())

assert (
current_labels == gold_labels
), f"Label mismatch for {split_description}: {current_labels} vs. {gold_labels}"
elif split_name == "dev2":
corpus = flair.datasets.NER_HIPE_2022(
version=dataset_version,
dataset_name=dataset_name,
language=language,
dev_split_name="dev2",
add_document_separator=add_document_separator,
)

corpus._train = corpus._dev
gold_labels = set(corpus.make_label_dictionary(label_type="ner").get_items())

assert (
len(corpus.dev) == total_sentences
), f"Sentence count mismatch for {split_description}: {len(corpus.dev)} vs. {total_sentences}"

test_hipe_2022(add_document_separator=True)
test_hipe_2022(add_document_separator=False)
assert (
current_labels == gold_labels
), f"Label mismatch for {split_description}: {current_labels} vs. {gold_labels}"

test_hipe_2022(dataset_version="v1.0", add_document_separator=True)
test_hipe_2022(dataset_version="v1.0", add_document_separator=False)
test_hipe_2022(dataset_version="v2.0", add_document_separator=True)
test_hipe_2022(dataset_version="v2.0", add_document_separator=False)


def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
Expand Down