Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for NERMuD 2023 Dataset #3087

Merged
merged 4 commits into from
Feb 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@
NER_MULTI_WIKIANN,
NER_MULTI_WIKINER,
NER_MULTI_XTREME,
NER_NERMUD,
NER_SWEDISH,
NER_TURKU,
NER_UKRAINIAN,
Expand Down Expand Up @@ -465,6 +466,7 @@
"NER_ICDAR_EUROPEANA",
"NER_ICELANDIC",
"NER_JAPANESE",
"NER_NERMUD",
"NER_MASAKHANE",
"NER_MULTI_WIKIANN",
"NER_MULTI_WIKINER",
Expand Down
77 changes: 77 additions & 0 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -4386,3 +4386,80 @@ def __init__(
column_delimiter="\t",
**corpusargs,
)


class NER_NERMUD(MultiCorpus):
    def __init__(
        self,
        domains: Union[str, List[str]] = "all",
        base_path: Union[str, Path, None] = None,
        in_memory: bool = False,
        **corpusargs,
    ):
        """
        Initialize the NERMuD 2023 dataset. NERMuD is a task presented at EVALITA 2023 consisting of the extraction and classification
        of named entities in a document, such as persons, organizations, and locations. NERMuD 2023 will include two different sub-tasks:

        - Domain-agnostic classification (DAC). Participants will be asked to select and classify entities among three categories
        (person, organization, location) in different types of texts (news, fiction, political speeches) using one single general model.

        - Domain-specific classification (DSC). Participants will be asked to deploy a different model for each of the above types,
        trying to increase the accuracy for each considered type.

        :param domains: Domains to be used. Supported are "WN" (Wikinews), "FIC" (fiction), "ADG" (De Gasperi subset) and "all".
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
        :raises ValueError: If a requested domain is not among the supported ones.
        """
        supported_domains = ["WN", "FIC", "ADG"]

        # Normalize the domain selection to a list of domain codes.
        if isinstance(domains, str):
            domains = supported_domains if domains == "all" else [domains]

        base_path = Path(base_path) if base_path else flair.cache_root / "datasets"

        # column format: token in column 0, NER tag in column 1
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        data_folder = base_path / dataset_name

        corpora: List[Corpus] = []

        github_path = "https://github.com/dhfbk/KIND/main/evalita-2023"

        for domain in domains:
            if domain not in supported_domains:
                log.error(f"Domain '{domain}' is not in list of supported domains!")
                log.error(f"Supported are '{supported_domains}'!")
                raise ValueError(f"Domain '{domain}' is not supported (supported domains: {supported_domains})")

            domain_folder = data_folder / domain.lower()

            # Only train and dev splits are published for the shared task; there is no test split.
            for split in ["train", "dev"]:
                cached_path(f"{github_path}/{domain}_{split}.tsv", domain_folder)

            corpus = ColumnCorpus(
                data_folder=domain_folder,
                train_file=f"{domain}_train.tsv",
                dev_file=f"{domain}_dev.tsv",
                test_file=None,
                column_format=columns,
                in_memory=in_memory,
                sample_missing_splits=False,  # No test data is available, so do not shrink dev data for shared task preparation!
                **corpusargs,
            )
            corpora.append(corpus)
        super().__init__(
            corpora,
            sample_missing_splits=False,
            name="nermud",
        )
20 changes: 20 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,26 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag
check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)


def test_nermud_corpus(tasks_base_path):
    """
    This test covers the NERMuD dataset. Official stats can be found here:
    https://github.com/dhfbk/KIND/tree/main/evalita-2023
    """
    expected_counts = {
        "WN": {"train": 10912, "dev": 2594},
        "FIC": {"train": 11423, "dev": 1051},
        "ADG": {"train": 5147, "dev": 1122},
    }

    for domain, counts in expected_counts.items():
        corpus = flair.datasets.NER_NERMUD(domains=domain)
        # Compare each loaded split against the officially published sentence counts.
        for split_name, split in (("train", corpus.train), ("dev", corpus.dev)):
            assert len(split) == counts[split_name], f"Mismatch in number of sentences for {split_name} split"


def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
corpus = MultiFileJsonlCorpus(
train_files=[tasks_base_path / "jsonl/train.jsonl"],
Expand Down