Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for NERMuD 2023 Dataset #3087

Merged
merged 4 commits into from
Feb 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@
NER_MULTI_WIKIANN,
NER_MULTI_WIKINER,
NER_MULTI_XTREME,
NER_NERMUD,
NER_SWEDISH,
NER_TURKU,
NER_UKRAINIAN,
Expand Down Expand Up @@ -465,6 +466,7 @@
"NER_ICDAR_EUROPEANA",
"NER_ICELANDIC",
"NER_JAPANESE",
"NER_NERMUD",
"NER_MASAKHANE",
"NER_MULTI_WIKIANN",
"NER_MULTI_WIKINER",
Expand Down
77 changes: 77 additions & 0 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -4386,3 +4386,80 @@ def __init__(
column_delimiter="\t",
**corpusargs,
)


class NER_NERMUD(MultiCorpus):
    def __init__(
        self,
        domains: Union[str, List[str]] = "all",
        base_path: Union[str, Path, None] = None,
        in_memory: bool = False,
        **corpusargs,
    ):
        """
        Initialize the NERMuD 2023 dataset. NERMuD is a task presented at EVALITA 2023 consisting of the extraction and classification
        of named entities in a document, such as persons, organizations, and locations. NERMuD 2023 will include two different sub-tasks:

        - Domain-agnostic classification (DAC). Participants will be asked to select and classify entities among three categories
        (person, organization, location) in different types of texts (news, fiction, political speeches) using one single general model.

        - Domain-specific classification (DSC). Participants will be asked to deploy a different model for each of the above types,
        trying to increase the accuracy for each considered type.

        :param domains: Domains to be used. Supported are "WN" (Wikinews), "FIC" (fiction), "ADG" (De Gasperi subset) and "all".
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
        :raises ValueError: If a requested domain is not among the supported ones.
        """
        supported_domains = ["WN", "FIC", "ADG"]

        # Normalize the domain selection to a list of domain codes.
        if isinstance(domains, str):
            domains = supported_domains if domains == "all" else [domains]

        base_path = Path(base_path) if base_path else flair.cache_root / "datasets"

        # column format: token in column 0, NER tag in column 1
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        data_folder = base_path / dataset_name

        corpora: List[Corpus] = []

        github_path = "https://github.com/dhfbk/KIND/main/evalita-2023"

        for domain in domains:
            if domain not in supported_domains:
                log.error(f"Domain '{domain}' is not in list of supported domains!")
                log.error(f"Supported are '{supported_domains}'!")
                raise ValueError(f"Domain '{domain}' is not supported (supported domains: {supported_domains})")

            domain_folder = data_folder / domain.lower()

            # Only train and dev splits are published for the shared task; there is no test split.
            for split in ["train", "dev"]:
                cached_path(f"{github_path}/{domain}_{split}.tsv", domain_folder)

            corpus = ColumnCorpus(
                data_folder=domain_folder,
                train_file=f"{domain}_train.tsv",
                dev_file=f"{domain}_dev.tsv",
                test_file=None,
                column_format=columns,
                in_memory=in_memory,
                sample_missing_splits=False,  # No test data is available, so do not shrink dev data for shared task preparation!
                **corpusargs,
            )
            corpora.append(corpus)
        super().__init__(
            corpora,
            sample_missing_splits=False,
            name="nermud",
        )
20 changes: 20 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,26 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag
check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)


def test_nermud_corpus(tasks_base_path):
    """
    This test covers the NERMuD dataset. Official stats can be found here:
    https://github.com/dhfbk/KIND/tree/main/evalita-2023
    """
    expected_counts = {
        "WN": {"train": 10912, "dev": 2594},
        "FIC": {"train": 11423, "dev": 1051},
        "ADG": {"train": 5147, "dev": 1122},
    }

    for domain, counts in expected_counts.items():
        corpus = flair.datasets.NER_NERMUD(domains=domain)
        # Compare each loaded split against the officially published sentence counts.
        for split_name, split in (("train", corpus.train), ("dev", corpus.dev)):
            assert len(split) == counts[split_name], f"Mismatch in number of sentences for {split_name} split"


def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
corpus = MultiFileJsonlCorpus(
train_files=[tasks_base_path / "jsonl/train.jsonl"],
Expand Down