diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
index 2d850466b..68bcfa363 100755
--- a/flair/datasets/__init__.py
+++ b/flair/datasets/__init__.py
@@ -202,6 +202,7 @@
     NER_MULTI_WIKIANN,
     NER_MULTI_WIKINER,
     NER_MULTI_XTREME,
+    NER_NERMUD,
     NER_SWEDISH,
     NER_TURKU,
     NER_UKRAINIAN,
@@ -465,6 +466,7 @@
     "NER_ICDAR_EUROPEANA",
     "NER_ICELANDIC",
     "NER_JAPANESE",
+    "NER_NERMUD",
     "NER_MASAKHANE",
     "NER_MULTI_WIKIANN",
     "NER_MULTI_WIKINER",
diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 024b22569..527b038fb 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4386,3 +4386,91 @@ def __init__(
             column_delimiter="\t",
             **corpusargs,
         )
+
+
+class NER_NERMUD(MultiCorpus):
+    def __init__(
+        self,
+        domains: Union[str, List[str]] = "all",
+        base_path: Union[str, Path] = None,
+        in_memory: bool = False,
+        **corpusargs,
+    ):
+        """
+        Initialize the NERMuD 2023 dataset. NERMuD is a task presented at EVALITA 2023 consisting of the extraction and
+        classification of named entities in a document, such as persons, organizations, and locations. NERMuD 2023
+        includes two different sub-tasks:
+
+        - Domain-agnostic classification (DAC). Participants will be asked to select and classify entities among three
+          categories (person, organization, location) in different types of texts (news, fiction, political speeches)
+          using one single general model.
+
+        - Domain-specific classification (DSC). Participants will be asked to deploy a different model for each of the
+          above types, trying to increase the accuracy for each considered type.
+
+        Only train and dev splits are downloaded; no test split is loaded.
+
+        :param domains: Domains to be used. Supported are "WN" (Wikinews), "FIC" (fiction), "ADG" (De Gasperi subset)
+            and "all". Either a single domain name or a list of domain names.
+        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
+            to point to a different folder but typically this should not be necessary.
+        :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy
+            RAM usage.
+        :param corpusargs: Additional keyword arguments passed on to each per-domain ColumnCorpus.
+        :raises ValueError: If a requested domain is not one of the supported domains.
+        """
+        supported_domains = ["WN", "FIC", "ADG"]
+
+        if isinstance(domains, str):
+            domains = supported_domains if domains == "all" else [domains]
+
+        # validate everything up front so that nothing is downloaded for an invalid request
+        for domain in domains:
+            if domain not in supported_domains:
+                message = (
+                    f"Domain '{domain}' is not in list of supported domains! Supported are '{supported_domains}'!"
+                )
+                log.error(message)
+                raise ValueError(message)
+
+        if not base_path:
+            base_path = flair.cache_root / "datasets"
+        else:
+            base_path = Path(base_path)
+
+        # column format
+        columns = {0: "text", 1: "ner"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        data_folder = base_path / dataset_name
+
+        corpora: List[Corpus] = []
+
+        github_path = "https://raw.githubusercontent.com/dhfbk/KIND/main/evalita-2023"
+
+        for domain in domains:
+            domain_folder = data_folder / domain.lower()
+
+            for split in ["train", "dev"]:
+                cached_path(f"{github_path}/{domain}_{split}.tsv", domain_folder)
+
+            corpus = ColumnCorpus(
+                data_folder=domain_folder,
+                train_file=f"{domain}_train.tsv",
+                dev_file=f"{domain}_dev.tsv",
+                test_file=None,
+                column_format=columns,
+                in_memory=in_memory,
+                # No test data is available, so do not shrink dev data for shared task preparation!
+                sample_missing_splits=False,
+                **corpusargs,
+            )
+            corpora.append(corpus)
+
+        super().__init__(
+            corpora,
+            sample_missing_splits=False,
+            name="nermud",
+        )
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 4a8765ff9..b6d253a1a 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -788,6 +788,27 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag
             check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)
 
 
+def test_nermud_corpus(tasks_base_path):
+    """
+    This test covers the NERMuD dataset. Official stats can be found here:
+    https://github.com/dhfbk/KIND/tree/main/evalita-2023
+    """
+    gold_stats = {
+        "WN": {"train": 10912, "dev": 2594},
+        "FIC": {"train": 11423, "dev": 1051},
+        "ADG": {"train": 5147, "dev": 1122},
+    }
+
+    def check_number_sentences(reference: int, actual: int, split_name: str):
+        assert actual == reference, f"Mismatch in number of sentences for {split_name} split"
+
+    for domain, stats in gold_stats.items():
+        corpus = flair.datasets.NER_NERMUD(domains=domain)
+        # reference (gold) value goes first, the observed corpus size second
+        check_number_sentences(stats["train"], len(corpus.train), f"{domain} train")
+        check_number_sentences(stats["dev"], len(corpus.dev), f"{domain} dev")
+
+
 def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
     corpus = MultiFileJsonlCorpus(
         train_files=[tasks_base_path / "jsonl/train.jsonl"],