From d242ebe264bace6ff10c1d83b40e78e1c925c910 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 21 Mar 2022 16:48:36 +0100 Subject: [PATCH 01/10] datasets: add support for v2 of HIPE-2022 dataset --- flair/datasets/sequence_labeling.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 18bd46be9..5db9bdf41 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4136,7 +4136,8 @@ def __init__( base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - version: str = "v1.0", + version: str = "v2.0", + branch_name: str = "release-v2.0", dev_split_name="dev", add_document_separator=False, sample_missing_splits=False, @@ -4152,6 +4153,7 @@ def __init__( :tag_to_bioes: Dataset will automatically transformed into BIOES format (internally). :in_memory: If True, keeps dataset in memory giving speedups in training. :version: Version of CLEF-HIPE dataset. Currently only v1.0 is supported and available. + :branch_name: Defines git branch name of HIPE data repository (main by default). :dev_split_name: Defines default name of development split (dev by default). Only the NewsEye dataset has currently two development splits: dev and dev2. :add_document_separator: If True, a special document seperator will be introduced. 
This is highly @@ -4180,19 +4182,25 @@ def __init__( } } + # v2.0 only adds new language and splits for AJMC dataset + hipe_available_splits["v2.0"] = hipe_available_splits.get("v1.0").copy() + hipe_available_splits["v2.0"]["ajmc"] = {"de": ["train", "dev"], "en": ["train", "dev"], "fr": ["train", "dev"]} + eos_marker = "EndOfSentence" document_separator = "# hipe2022:document_id" # Special document marker for sample splits in AJMC dataset - if f"{version}/{dataset_name}" == "v1.0/ajmc": + if f"{dataset_name}" == "ajmc": document_separator = "# hipe2022:original_source" columns = {0: "text", 1: "ner"} dataset_base = self.__class__.__name__.lower() - data_folder = base_path / dataset_base / dataset_name / language + data_folder = base_path / dataset_base / version / dataset_name / language - data_url = f"https://github.com/hipe-eval/HIPE-2022-data/raw/main/data/{version}/{dataset_name}/{language}" + data_url = ( + f"https://github.com/hipe-eval/HIPE-2022-data/raw/{branch_name}/data/{version}/{dataset_name}/{language}" + ) dataset_splits = hipe_available_splits[version][dataset_name][language] From 51df948b1c9f10b804c02e70ac6b0c6591122450 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 21 Mar 2022 16:48:59 +0100 Subject: [PATCH 02/10] tests: update cases for v2 of HIPE-2022 dataset --- tests/test_datasets.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 789b070db..2ccf561fd 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,5 +1,7 @@ import shutil +from importlib_metadata import version + import pytest import flair @@ -414,12 +416,29 @@ def test_hipe_2022_corpus(tasks_base_path): } } + hipe_stats["v2.0"] = hipe_stats.get("v1.0").copy() + hipe_stats["v2.0"]["ajmc"] = { + "de": { + "train": {"sents": 1022 + 2, "docs": 76}, # 2 sentences with missing EOS marker + "dev": {"sents": 192, "docs": 14}, + }, + "en": { + "train": {"sents": 1153 + 
1, "docs": 60}, # 1 sentence with missing EOS marker + "dev": {"sents": 251 + 1, "docs": 14}, # 1 sentence with missing EOS marker + }, + "fr": { + "train": {"sents": 893 + 1, "docs": 72}, # 1 sentence with missing EOS marker + "dev": {"sents": 201 + 1, "docs": 17}, # 1 sentence with missing EOS marker + }, + } + def test_hipe_2022(dataset_version="v1.0", add_document_separator=True): for dataset_name, languages in hipe_stats[dataset_version].items(): for language in languages: splits = languages[language] corpus = flair.datasets.NER_HIPE_2022( + version=dataset_version, dataset_name=dataset_name, language=language, dev_split_name="dev", @@ -441,6 +460,7 @@ def test_hipe_2022(dataset_version="v1.0", add_document_separator=True): ), f"Sentence count mismatch for {split_description}: {len(corpus.dev)} vs. {total_sentences}" elif split_name == "dev2": corpus = flair.datasets.NER_HIPE_2022( + version=dataset_version, dataset_name=dataset_name, language=language, dev_split_name="dev2", @@ -451,8 +471,10 @@ def test_hipe_2022(dataset_version="v1.0", add_document_separator=True): len(corpus.dev) == total_sentences ), f"Sentence count mismatch for {split_description}: {len(corpus.dev)} vs. 
{total_sentences}" - test_hipe_2022(add_document_separator=True) - test_hipe_2022(add_document_separator=False) + test_hipe_2022(dataset_version="v1.0", add_document_separator=True) + test_hipe_2022(dataset_version="v1.0", add_document_separator=False) + test_hipe_2022(dataset_version="v2.0", add_document_separator=True) + test_hipe_2022(dataset_version="v2.0", add_document_separator=False) def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path): From eae619a1cec60219a069169b04ece7515639b529 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 22 Mar 2022 10:49:19 +0100 Subject: [PATCH 03/10] tests: minor flake fix for datasets --- tests/test_datasets.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 2ccf561fd..62c968092 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,7 +1,5 @@ import shutil -from importlib_metadata import version - import pytest import flair From 806a5692e4b723fabce6c30331898fb47d8082a4 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 22 Mar 2022 11:12:38 +0100 Subject: [PATCH 04/10] tests: adjust latest HIPE v2.0 data changes for SONAR and NewsEye dataset --- tests/test_datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 62c968092..bbc83742c 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -429,6 +429,10 @@ def test_hipe_2022_corpus(tasks_base_path): "dev": {"sents": 201 + 1, "docs": 17}, # 1 sentence with missing EOS marker }, } + hipe_stats["v2.0"]["newseye"] = {"de": {"train": {"sents": 20839 + 1, "docs": 7}}} # missing EOD marker + hipe_stats["v2.0"]["sonar"] = { + "de": {"dev": {"sents": 816 + 10, "docs": 10}} # 9 sentences with missing EOS marker + missing EOD + } def test_hipe_2022(dataset_version="v1.0", add_document_separator=True): for dataset_name, languages in hipe_stats[dataset_version].items(): From 
15cda851fa38cefc9530f97e00486f6390a4f2e8 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 22 Mar 2022 11:40:04 +0100 Subject: [PATCH 05/10] datasets: switch to main as default branch name for HIPE-2022 data repo --- flair/datasets/sequence_labeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 5db9bdf41..db3acd11c 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4137,7 +4137,7 @@ def __init__( tag_to_bioes: str = "ner", in_memory: bool = True, version: str = "v2.0", - branch_name: str = "release-v2.0", + branch_name: str = "main", dev_split_name="dev", add_document_separator=False, sample_missing_splits=False, From 49cca8b53e465593151516773a1d077fdd47fb6a Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 22 Mar 2022 17:58:58 +0100 Subject: [PATCH 06/10] datasets: introduce some bug fixes for HIPE-2022 (tab as delimiter, ignore empty tokens) --- flair/datasets/sequence_labeling.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index db3acd11c..04519b24f 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4242,6 +4242,7 @@ def __init__( in_memory=in_memory, document_separator_token="-DOCSTART-", skip_first_line=True, + column_delimiter="\t", comment_symbol="# ", sample_missing_splits=sample_missing_splits, **corpusargs, @@ -4259,7 +4260,11 @@ def __prepare_corpus( f_out.write(lines[0] + "\n") for line in lines[1:]: - line = line.rstrip() + if line.startswith(" "): + # Workaround for empty tokens + continue + + line = line.strip() # Add "real" document marker if add_document_separator and line.startswith(document_separator): From 3ea67b0ce34dbc3f9e660234db1638cfba1a83a7 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 22 Mar 2022 17:59:18 +0100 Subject: 
[PATCH 07/10] test: include label checking tests for HIPE-2022 --- tests/test_datasets.py | 142 ++++++++++++++++++++++++++++++++--------- 1 file changed, 111 insertions(+), 31 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index bbc83742c..54104027c 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -373,65 +373,120 @@ def test_hipe_2022_corpus(tasks_base_path): # We have manually checked, that these numbers are correct: hipe_stats = { "v1.0": { - "ajmc": {"de": {"sample": {"sents": 119, "docs": 8}}, "en": {"sample": {"sents": 83, "docs": 5}}}, + "ajmc": { + "de": {"sample": {"sents": 119, "docs": 8, "labels": ["date", "loc", "pers", "scope", "work"]}}, + "en": {"sample": {"sents": 83, "docs": 5, "labels": ["date", "loc", "pers", "scope", "work"]}}, + }, "hipe2020": { "de": { - "train": {"sents": 3470 + 2, "docs": 103}, # 2 sentences with missing EOS marker - "dev": { - "sents": 1202, - "docs": 33, + "train": { + "sents": 3470 + 2, # 2 sentences with missing EOS marker + "docs": 103, + "labels": ["loc", "org", "pers", "prod", "time"], }, + "dev": {"sents": 1202, "docs": 33, "labels": ["loc", "org", "pers", "prod", "time"]}, + }, + "en": {"dev": {"sents": 1045, "docs": 80, "labels": ["loc", "org", "pers", "prod", "time"]}}, + "fr": { + "train": {"sents": 5743, "docs": 158, "labels": ["loc", "org", "pers", "prod", "time", "comp"]}, + "dev": {"sents": 1244, "docs": 43, "labels": ["loc", "org", "pers", "prod", "time"]}, }, - "en": {"dev": {"sents": 1045, "docs": 80}}, - "fr": {"train": {"sents": 5743, "docs": 158}, "dev": {"sents": 1244, "docs": 43}}, }, - "letemps": {"fr": {"train": {"sents": 14051, "docs": 414}, "dev": {"sents": 1341, "docs": 51}}}, + "letemps": { + "fr": { + "train": {"sents": 14051, "docs": 414, "labels": ["loc", "org", "pers"]}, + "dev": {"sents": 1341, "docs": 51, "labels": ["loc", "org", "pers"]}, + } + }, "newseye": { # +1 offset, because of missing EOS marker at EOD "de": { - "train": {"sents": 23646 
+ 1, "docs": 11}, - "dev": {"sents": 1110 + 1, "docs": 12}, - "dev2": {"sents": 1541 + 1, "docs": 12}, + "train": {"sents": 23646 + 1, "docs": 11, "labels": ["HumanProd", "LOC", "ORG", "PER"]}, + "dev": {"sents": 1110 + 1, "docs": 12, "labels": ["HumanProd", "LOC", "ORG", "PER"]}, + "dev2": {"sents": 1541 + 1, "docs": 12, "labels": ["HumanProd", "LOC", "ORG", "PER"]}, }, "fi": { - "train": {"sents": 1141 + 1, "docs": 24}, - "dev": {"sents": 140 + 1, "docs": 24}, - "dev2": {"sents": 104 + 1, "docs": 21}, + "train": {"sents": 1141 + 1, "docs": 24, "labels": ["HumanProd", "LOC", "ORG", "PER"]}, + "dev": {"sents": 140 + 1, "docs": 24, "labels": ["HumanProd", "LOC", "ORG", "PER"]}, + "dev2": {"sents": 104 + 1, "docs": 21, "labels": ["HumanProd", "LOC", "ORG", "PER"]}, }, "fr": { - "train": {"sents": 7106 + 1, "docs": 35}, - "dev": {"sents": 662 + 1, "docs": 35}, - "dev2": {"sents": 1016 + 1, "docs": 35}, + "train": {"sents": 7106 + 1, "docs": 35, "labels": ["HumanProd", "LOC", "ORG", "PER"]}, + "dev": {"sents": 662 + 1, "docs": 35, "labels": ["HumanProd", "LOC", "ORG", "PER"]}, + "dev2": {"sents": 1016 + 1, "docs": 35, "labels": ["HumanProd", "LOC", "ORG", "PER"]}, }, "sv": { - "train": {"sents": 1063 + 1, "docs": 21}, - "dev": {"sents": 126 + 1, "docs": 21}, - "dev2": {"sents": 136 + 1, "docs": 21}, + "train": {"sents": 1063 + 1, "docs": 21, "labels": ["HumanProd", "LOC", "ORG", "PER"]}, + "dev": {"sents": 126 + 1, "docs": 21, "labels": ["HumanProd", "LOC", "ORG", "PER"]}, + "dev2": {"sents": 136 + 1, "docs": 21, "labels": ["HumanProd", "LOC", "ORG", "PER"]}, }, }, - "sonar": {"de": {"dev": {"sents": 1603 + 10, "docs": 10}}}, # 10 sentences with missing EOS marker - "topres19th": {"en": {"train": {"sents": 5874, "docs": 309}, "dev": {"sents": 646, "docs": 34}}}, + "sonar": { + "de": { + "dev": { + "sents": 1603 + 10, # 10 sentences with missing EOS marker + "docs": 10, + "labels": ["LOC", "ORG", "PER"], + } + } + }, + "topres19th": { + "en": { + "train": {"sents": 
5874, "docs": 309, "labels": ["BUILDING", "LOC", "STREET"]}, + "dev": {"sents": 646, "docs": 34, "labels": ["BUILDING", "LOC", "STREET"]}, + } + }, } } hipe_stats["v2.0"] = hipe_stats.get("v1.0").copy() hipe_stats["v2.0"]["ajmc"] = { "de": { - "train": {"sents": 1022 + 2, "docs": 76}, # 2 sentences with missing EOS marker - "dev": {"sents": 192, "docs": 14}, + "train": { + "sents": 1022 + 2, # 2 sentences with missing EOS marker + "docs": 76, + "labels": ["date", "loc", "object", "pers", "scope", "work"], + }, + "dev": {"sents": 192, "docs": 14, "labels": ["loc", "object", "pers", "scope", "work"]}, }, "en": { - "train": {"sents": 1153 + 1, "docs": 60}, # 1 sentence with missing EOS marker - "dev": {"sents": 251 + 1, "docs": 14}, # 1 sentence with missing EOS marker + "train": { + "sents": 1153 + 1, # 1 sentence with missing EOS marker + "docs": 60, + "labels": ["date", "loc", "object", "pers", "scope", "work"], + }, + "dev": { + "sents": 251 + 1, # 1 sentence with missing EOS marker + "docs": 14, + "labels": ["date", "loc", "pers", "scope", "work"], + }, }, "fr": { - "train": {"sents": 893 + 1, "docs": 72}, # 1 sentence with missing EOS marker - "dev": {"sents": 201 + 1, "docs": 17}, # 1 sentence with missing EOS marker + "train": { + "sents": 893 + 1, # 1 sentence with missing EOS marker + "docs": 72, + "labels": ["date", "loc", "object", "pers", "scope", "work"], + }, + "dev": { + "sents": 201 + 1, # 1 sentence with missing EOS marker + "docs": 17, + "labels": ["pers", "scope", "work"], + }, }, } - hipe_stats["v2.0"]["newseye"] = {"de": {"train": {"sents": 20839 + 1, "docs": 7}}} # missing EOD marker + hipe_stats["v2.0"]["newseye"] = { + "de": { + "train": {"sents": 20839 + 1, "docs": 7, "labels": ["HumanProd", "LOC", "ORG", "PER"]} # missing EOD marker + } + } hipe_stats["v2.0"]["sonar"] = { - "de": {"dev": {"sents": 816 + 10, "docs": 10}} # 9 sentences with missing EOS marker + missing EOD + "de": { + "dev": { + "sents": 816 + 10, # 9 sentences with missing 
EOS marker + missing EOD + "docs": 10, + "labels": ["LOC", "ORG", "PER"], + } + } } def test_hipe_2022(dataset_version="v1.0", add_document_separator=True): @@ -450,16 +505,34 @@ def test_hipe_2022(dataset_version="v1.0", add_document_separator=True): for split_name, stats in splits.items(): split_description = f"{dataset_name}/{language}@{split_name}" - total_sentences = sum(stats.values()) if add_document_separator else stats["sents"] + current_sents = stats["sents"] + current_docs = stats["docs"] + current_labels = set(stats["labels"] + [""]) + + total_sentences = current_sents + current_docs if add_document_separator else stats["sents"] if split_name == "train": assert ( len(corpus.train) == total_sentences ), f"Sentence count mismatch for {split_description}: {len(corpus.train)} vs. {total_sentences}" + + gold_labels = set(corpus.make_label_dictionary(label_type="ner").get_items()) + + assert ( + current_labels == gold_labels + ), f"Label mismatch for {split_description}: {current_labels} vs. {gold_labels}" + elif split_name in ["dev", "sample"]: assert ( len(corpus.dev) == total_sentences ), f"Sentence count mismatch for {split_description}: {len(corpus.dev)} vs. {total_sentences}" + + corpus._train = corpus._dev + gold_labels = set(corpus.make_label_dictionary(label_type="ner").get_items()) + + assert ( + current_labels == gold_labels + ), f"Label mismatch for {split_description}: {current_labels} vs. {gold_labels}" elif split_name == "dev2": corpus = flair.datasets.NER_HIPE_2022( version=dataset_version, @@ -469,10 +542,17 @@ def test_hipe_2022(dataset_version="v1.0", add_document_separator=True): add_document_separator=add_document_separator, ) + corpus._train = corpus._dev + gold_labels = set(corpus.make_label_dictionary(label_type="ner").get_items()) + assert ( len(corpus.dev) == total_sentences ), f"Sentence count mismatch for {split_description}: {len(corpus.dev)} vs. 
{total_sentences}" + assert ( + current_labels == gold_labels + ), f"Label mismatch for {split_description}: {current_labels} vs. {gold_labels}" + test_hipe_2022(dataset_version="v1.0", add_document_separator=True) test_hipe_2022(dataset_version="v1.0", add_document_separator=False) test_hipe_2022(dataset_version="v2.0", add_document_separator=True) From 88dc52ff478f627fd87c9bf971bd6e3042631093 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 22 Mar 2022 18:08:52 +0100 Subject: [PATCH 08/10] datasets: beautify empty token fix for HIPE-2022 dataset reader --- flair/datasets/sequence_labeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 04519b24f..8803c6671 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4260,7 +4260,7 @@ def __prepare_corpus( f_out.write(lines[0] + "\n") for line in lines[1:]: - if line.startswith(" "): + if line.startswith(" \t"): # Workaround for empty tokens continue From 7afcc24b6df582aa23f2d780c8b38aa2e1f8bc30 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Thu, 24 Mar 2022 09:16:26 +0100 Subject: [PATCH 09/10] tests: fix mypy error (hopefully) --- tests/test_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 54104027c..6ebeb258b 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -439,7 +439,7 @@ def test_hipe_2022_corpus(tasks_base_path): } } - hipe_stats["v2.0"] = hipe_stats.get("v1.0").copy() + hipe_stats["v2.0"] = hipe_stats["v1.0"].copy() hipe_stats["v2.0"]["ajmc"] = { "de": { "train": { From 5e545c6a9cc730303517d8033906bf75231d3426 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Thu, 24 Mar 2022 09:19:15 +0100 Subject: [PATCH 10/10] datasets: fix mypy error (hopefully) --- flair/datasets/sequence_labeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git
a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 8803c6671..db84ff36f 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4183,7 +4183,7 @@ def __init__( } # v2.0 only adds new language and splits for AJMC dataset - hipe_available_splits["v2.0"] = hipe_available_splits.get("v1.0").copy() + hipe_available_splits["v2.0"] = hipe_available_splits["v1.0"].copy() hipe_available_splits["v2.0"]["ajmc"] = {"de": ["train", "dev"], "en": ["train", "dev"], "fr": ["train", "dev"]} eos_marker = "EndOfSentence"