From b1a356fb298ab2ca15b12640aaa41bb99f25a430 Mon Sep 17 00:00:00 2001 From: Robert Martin Date: Sun, 18 Apr 2021 12:10:51 +0200 Subject: [PATCH 1/3] added Naija Pidgin NER dataset --- flair/datasets/__init__.py | 1 + flair/datasets/sequence_labeling.py | 49 +++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 98eeb46d2..9230e3bfb 100755 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -27,6 +27,7 @@ from .sequence_labeling import MIT_MOVIE_NER_SIMPLE from .sequence_labeling import MIT_MOVIE_NER_COMPLEX from .sequence_labeling import MIT_RESTAURANT_NER +from .sequence_labeling import NAIJA_PIDGIN_NER from .sequence_labeling import NER_BASQUE from .sequence_labeling import NER_FINNISH from .sequence_labeling import NER_SWEDISH diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 33e6b1d56..1678984f5 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -1548,6 +1548,55 @@ def __init__( ) +class NAIJA_PIDGIN_NER(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + """ + Initialize the Naija Pidgin corpus for NER available on: + https://github.com/masakhane-io/masakhane-ner/tree/main/data/pcm + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + model_path = "https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/pcm/" + + cached_path(f"{model_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{model_path}train.txt", Path("datasets") / dataset_name) + cached_path(f"{model_path}dev.txt", Path("datasets") / dataset_name) + + super(NAIJA_PIDGIN_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + **corpusargs, + ) + + class NER_BASQUE(ColumnCorpus): def __init__( self, From 4b3da9655c145aabf4af5612d0ac51a458eb6723 Mon Sep 17 00:00:00 2001 From: Robert Martin Date: Sun, 18 Apr 2021 13:00:32 +0200 Subject: [PATCH 2/3] renamed path variable --- flair/datasets/sequence_labeling.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 1678984f5..e985c8a38 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -1582,11 +1582,11 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - model_path = "https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/pcm/" + corpus_path = "https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/pcm/" - cached_path(f"{model_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{model_path}train.txt", Path("datasets") / dataset_name) - cached_path(f"{model_path}dev.txt", Path("datasets") / dataset_name) + cached_path(f"{corpus_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{corpus_path}train.txt", Path("datasets") / dataset_name) + cached_path(f"{corpus_path}dev.txt", Path("datasets") / dataset_name) super(NAIJA_PIDGIN_NER, self).__init__( data_folder, From 5a3ce09a2a990e99438c2b34c35494624125854f Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Mon, 19 Apr 2021 15:51:24 +0200 Subject: [PATCH 3/3] correct errors from merge --- flair/datasets/sequence_labeling.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index b12063959..7353e5e6e 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -1652,7 +1652,20 @@ def __init__( :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + corpus_path = "https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/pcm/" cached_path(f"{corpus_path}test.txt", Path("datasets") / dataset_name)