diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 18bd46be9..db84ff36f 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4136,7 +4136,8 @@ def __init__(
         base_path: Union[str, Path] = None,
         tag_to_bioes: str = "ner",
         in_memory: bool = True,
-        version: str = "v1.0",
+        version: str = "v2.0",
+        branch_name: str = "main",
         dev_split_name="dev",
         add_document_separator=False,
         sample_missing_splits=False,
@@ -4152,6 +4153,7 @@ def __init__(
         :tag_to_bioes: Dataset will automatically transformed into BIOES format (internally).
         :in_memory: If True, keeps dataset in memory giving speedups in training.
         :version: Version of CLEF-HIPE dataset. Currently only v1.0 is supported and available.
+        :branch_name: Defines git branch name of HIPE data repository (main by default).
         :dev_split_name: Defines default name of development split (dev by default). Only the NewsEye dataset has
         currently two development splits: dev and dev2.
         :add_document_separator: If True, a special document seperator will be introduced. This is highly
@@ -4180,19 +4182,25 @@ def __init__(
             }
         }
 
+        # v2.0 only adds new language and splits for AJMC dataset
+        hipe_available_splits["v2.0"] = hipe_available_splits["v1.0"].copy()
+        hipe_available_splits["v2.0"]["ajmc"] = {"de": ["train", "dev"], "en": ["train", "dev"], "fr": ["train", "dev"]}
+
         eos_marker = "EndOfSentence"
         document_separator = "# hipe2022:document_id"
 
         # Special document marker for sample splits in AJMC dataset
-        if f"{version}/{dataset_name}" == "v1.0/ajmc":
+        if f"{dataset_name}" == "ajmc":
             document_separator = "# hipe2022:original_source"
 
         columns = {0: "text", 1: "ner"}
 
         dataset_base = self.__class__.__name__.lower()
-        data_folder = base_path / dataset_base / dataset_name / language
+        data_folder = base_path / dataset_base / version / dataset_name / language
 
-        data_url = f"https://github.com/hipe-eval/HIPE-2022-data/raw/main/data/{version}/{dataset_name}/{language}"
+        data_url = (
+            f"https://github.com/hipe-eval/HIPE-2022-data/raw/{branch_name}/data/{version}/{dataset_name}/{language}"
+        )
 
         dataset_splits = hipe_available_splits[version][dataset_name][language]
 
@@ -4234,6 +4242,7 @@ def __init__(
             in_memory=in_memory,
             document_separator_token="-DOCSTART-",
             skip_first_line=True,
+            column_delimiter="\t",
             comment_symbol="# ",
             sample_missing_splits=sample_missing_splits,
             **corpusargs,
@@ -4251,7 +4260,11 @@ def __prepare_corpus(
             f_out.write(lines[0] + "\n")
 
             for line in lines[1:]:
-                line = line.rstrip()
+                if line.startswith(" \t"):
+                    # Workaround for empty tokens
+                    continue
+
+                line = line.strip()
 
                 # Add "real" document marker
                 if add_document_separator and line.startswith(document_separator):
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 789b070db..6ebeb258b 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -373,44 +373,119 @@ def test_hipe_2022_corpus(tasks_base_path):
     # We have manually checked, that these numbers are correct:
     hipe_stats = {
         "v1.0": {
-            "ajmc": {"de": {"sample": {"sents": 119, "docs": 8}}, "en": {"sample": {"sents": 83, "docs": 5}}},
+            "ajmc": {
+                "de": {"sample": {"sents": 119, "docs": 8, "labels": ["date", "loc", "pers", "scope", "work"]}},
+                "en": {"sample": {"sents": 83, "docs": 5, "labels": ["date", "loc", "pers", "scope", "work"]}},
+            },
             "hipe2020": {
                 "de": {
-                    "train": {"sents": 3470 + 2, "docs": 103},  # 2 sentences with missing EOS marker
-                    "dev": {
-                        "sents": 1202,
-                        "docs": 33,
+                    "train": {
+                        "sents": 3470 + 2,  # 2 sentences with missing EOS marker
+                        "docs": 103,
+                        "labels": ["loc", "org", "pers", "prod", "time"],
                     },
+                    "dev": {"sents": 1202, "docs": 33, "labels": ["loc", "org", "pers", "prod", "time"]},
+                },
+                "en": {"dev": {"sents": 1045, "docs": 80, "labels": ["loc", "org", "pers", "prod", "time"]}},
+                "fr": {
+                    "train": {"sents": 5743, "docs": 158, "labels": ["loc", "org", "pers", "prod", "time", "comp"]},
+                    "dev": {"sents": 1244, "docs": 43, "labels": ["loc", "org", "pers", "prod", "time"]},
                 },
-                "en": {"dev": {"sents": 1045, "docs": 80}},
-                "fr": {"train": {"sents": 5743, "docs": 158}, "dev": {"sents": 1244, "docs": 43}},
             },
-            "letemps": {"fr": {"train": {"sents": 14051, "docs": 414}, "dev": {"sents": 1341, "docs": 51}}},
+            "letemps": {
+                "fr": {
+                    "train": {"sents": 14051, "docs": 414, "labels": ["loc", "org", "pers"]},
+                    "dev": {"sents": 1341, "docs": 51, "labels": ["loc", "org", "pers"]},
+                }
+            },
             "newseye": {
                 # +1 offset, because of missing EOS marker at EOD
                 "de": {
-                    "train": {"sents": 23646 + 1, "docs": 11},
-                    "dev": {"sents": 1110 + 1, "docs": 12},
-                    "dev2": {"sents": 1541 + 1, "docs": 12},
+                    "train": {"sents": 23646 + 1, "docs": 11, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
+                    "dev": {"sents": 1110 + 1, "docs": 12, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
+                    "dev2": {"sents": 1541 + 1, "docs": 12, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
                 },
                 "fi": {
-                    "train": {"sents": 1141 + 1, "docs": 24},
-                    "dev": {"sents": 140 + 1, "docs": 24},
-                    "dev2": {"sents": 104 + 1, "docs": 21},
+                    "train": {"sents": 1141 + 1, "docs": 24, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
+                    "dev": {"sents": 140 + 1, "docs": 24, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
+                    "dev2": {"sents": 104 + 1, "docs": 21, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
                 },
                 "fr": {
-                    "train": {"sents": 7106 + 1, "docs": 35},
-                    "dev": {"sents": 662 + 1, "docs": 35},
-                    "dev2": {"sents": 1016 + 1, "docs": 35},
+                    "train": {"sents": 7106 + 1, "docs": 35, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
+                    "dev": {"sents": 662 + 1, "docs": 35, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
+                    "dev2": {"sents": 1016 + 1, "docs": 35, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
                 },
                 "sv": {
-                    "train": {"sents": 1063 + 1, "docs": 21},
-                    "dev": {"sents": 126 + 1, "docs": 21},
-                    "dev2": {"sents": 136 + 1, "docs": 21},
+                    "train": {"sents": 1063 + 1, "docs": 21, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
+                    "dev": {"sents": 126 + 1, "docs": 21, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
+                    "dev2": {"sents": 136 + 1, "docs": 21, "labels": ["HumanProd", "LOC", "ORG", "PER"]},
                 },
             },
-            "sonar": {"de": {"dev": {"sents": 1603 + 10, "docs": 10}}},  # 10 sentences with missing EOS marker
-            "topres19th": {"en": {"train": {"sents": 5874, "docs": 309}, "dev": {"sents": 646, "docs": 34}}},
+            "sonar": {
+                "de": {
+                    "dev": {
+                        "sents": 1603 + 10,  # 10 sentences with missing EOS marker
+                        "docs": 10,
+                        "labels": ["LOC", "ORG", "PER"],
+                    }
+                }
+            },
+            "topres19th": {
+                "en": {
+                    "train": {"sents": 5874, "docs": 309, "labels": ["BUILDING", "LOC", "STREET"]},
+                    "dev": {"sents": 646, "docs": 34, "labels": ["BUILDING", "LOC", "STREET"]},
+                }
+            },
+        }
+    }
+
+    hipe_stats["v2.0"] = hipe_stats["v1.0"].copy()
+    hipe_stats["v2.0"]["ajmc"] = {
+        "de": {
+            "train": {
+                "sents": 1022 + 2,  # 2 sentences with missing EOS marker
+                "docs": 76,
+                "labels": ["date", "loc", "object", "pers", "scope", "work"],
+            },
+            "dev": {"sents": 192, "docs": 14, "labels": ["loc", "object", "pers", "scope", "work"]},
+        },
+        "en": {
+            "train": {
+                "sents": 1153 + 1,  # 1 sentence with missing EOS marker
+                "docs": 60,
+                "labels": ["date", "loc", "object", "pers", "scope", "work"],
+            },
+            "dev": {
+                "sents": 251 + 1,  # 1 sentence with missing EOS marker
+                "docs": 14,
+                "labels": ["date", "loc", "pers", "scope", "work"],
+            },
+        },
+        "fr": {
+            "train": {
+                "sents": 893 + 1,  # 1 sentence with missing EOS marker
+                "docs": 72,
+                "labels": ["date", "loc", "object", "pers", "scope", "work"],
+            },
+            "dev": {
+                "sents": 201 + 1,  # 1 sentence with missing EOS marker
+                "docs": 17,
+                "labels": ["pers", "scope", "work"],
+            },
+        },
+    }
+    hipe_stats["v2.0"]["newseye"] = {
+        "de": {
+            "train": {"sents": 20839 + 1, "docs": 7, "labels": ["HumanProd", "LOC", "ORG", "PER"]}  # missing EOD marker
+        }
+    }
+    hipe_stats["v2.0"]["sonar"] = {
+        "de": {
+            "dev": {
+                "sents": 816 + 10,  # 9 sentences with missing EOS marker + missing EOD
+                "docs": 10,
+                "labels": ["LOC", "ORG", "PER"],
+            }
         }
     }
 
@@ -420,6 +495,7 @@ def test_hipe_2022(dataset_version="v1.0", add_document_separator=True):
                 splits = languages[language]
 
                 corpus = flair.datasets.NER_HIPE_2022(
+                    version=dataset_version,
                     dataset_name=dataset_name,
                     language=language,
                     dev_split_name="dev",
@@ -429,30 +505,58 @@ def test_hipe_2022(dataset_version="v1.0", add_document_separator=True):
                 for split_name, stats in splits.items():
                     split_description = f"{dataset_name}/{language}@{split_name}"
 
-                    total_sentences = sum(stats.values()) if add_document_separator else stats["sents"]
+                    current_sents = stats["sents"]
+                    current_docs = stats["docs"]
+                    current_labels = set(stats["labels"] + [""])
+
+                    total_sentences = current_sents + current_docs if add_document_separator else stats["sents"]
 
                     if split_name == "train":
                         assert (
                             len(corpus.train) == total_sentences
                         ), f"Sentence count mismatch for {split_description}: {len(corpus.train)} vs. {total_sentences}"
+
+                        gold_labels = set(corpus.make_label_dictionary(label_type="ner").get_items())
+
+                        assert (
+                            current_labels == gold_labels
+                        ), f"Label mismatch for {split_description}: {current_labels} vs. {gold_labels}"
+
                     elif split_name in ["dev", "sample"]:
                         assert (
                             len(corpus.dev) == total_sentences
                         ), f"Sentence count mismatch for {split_description}: {len(corpus.dev)} vs. {total_sentences}"
+
+                        corpus._train = corpus._dev
+                        gold_labels = set(corpus.make_label_dictionary(label_type="ner").get_items())
+
+                        assert (
+                            current_labels == gold_labels
+                        ), f"Label mismatch for {split_description}: {current_labels} vs. {gold_labels}"
                     elif split_name == "dev2":
                         corpus = flair.datasets.NER_HIPE_2022(
+                            version=dataset_version,
                             dataset_name=dataset_name,
                             language=language,
                             dev_split_name="dev2",
                             add_document_separator=add_document_separator,
                         )
 
+                        corpus._train = corpus._dev
+                        gold_labels = set(corpus.make_label_dictionary(label_type="ner").get_items())
+
                         assert (
                             len(corpus.dev) == total_sentences
                         ), f"Sentence count mismatch for {split_description}: {len(corpus.dev)} vs. {total_sentences}"
 
-    test_hipe_2022(add_document_separator=True)
-    test_hipe_2022(add_document_separator=False)
+                        assert (
+                            current_labels == gold_labels
+                        ), f"Label mismatch for {split_description}: {current_labels} vs. {gold_labels}"
+
+    test_hipe_2022(dataset_version="v1.0", add_document_separator=True)
+    test_hipe_2022(dataset_version="v1.0", add_document_separator=False)
+    test_hipe_2022(dataset_version="v2.0", add_document_separator=True)
+    test_hipe_2022(dataset_version="v2.0", add_document_separator=False)
 
 
 def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
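
Usage sketch (illustrative, not part of the patch): the keyword arguments below are the ones visible in the constructor changes above (version, branch_name, dataset_name, language, dev_split_name, add_document_separator); the concrete dataset and language values are example choices taken from the splits listed in the patch.

    import flair.datasets

    # Hypothetical example of loading a HIPE-2022 v2.0 corpus with the new parameters.
    corpus = flair.datasets.NER_HIPE_2022(
        version="v2.0",               # new default introduced by this patch
        branch_name="main",           # new parameter: branch of the HIPE-2022-data repository
        dataset_name="ajmc",
        language="de",
        dev_split_name="dev",
        add_document_separator=True,  # recommended when using the FLERT approach (see docstring)
    )
    print(corpus)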