Merge pull request #2227 from mart1nro/add_naija_pidgin_ner_corpus

Added Naija Pidgin NER dataset (Introduction to NLP @ HU Berlin)
flairNLP · Apr 19, 2021 · 8392dc6 · 8392dc6
2 parents 6cf6c03 + 5a3ce09
commit 8392dc6
Show file tree

Hide file tree

Showing 2 changed files with 50 additions and 2 deletions.
diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
@@ -29,6 +29,7 @@
 from .sequence_labeling import MIT_MOVIE_NER_SIMPLE
 from .sequence_labeling import MIT_MOVIE_NER_COMPLEX
 from .sequence_labeling import MIT_RESTAURANT_NER
+from .sequence_labeling import NAIJA_PIDGIN_NER
 from .sequence_labeling import NER_BASQUE
 from .sequence_labeling import NER_FINNISH
 from .sequence_labeling import NER_SWEDISH

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
@@ -1594,7 +1594,7 @@ def __init__(
             **corpusargs,
         )
 
-
+        
 class KINYARWANDA_NER(ColumnCorpus):
     def __init__(
             self,
@@ -1603,7 +1603,6 @@ def __init__(
             in_memory: bool = True,
             **corpusargs,
     ):
-
         if type(base_path) == str:
             base_path: Path = Path(base_path)
 
@@ -1632,6 +1631,54 @@ def __init__(
             in_memory=in_memory,
             **corpusargs,
         )
+
+
+class NAIJA_PIDGIN_NER(ColumnCorpus):
+    def __init__(
+            self,
+            base_path: Union[str, Path] = None,
+            tag_to_bioes: str = "ner",
+            in_memory: bool = True,
+            **corpusargs,
+    ):
+        """
+        Initialize the Naija Pidgin corpus for NER available on:
+        https://github.com/masakhane-io/masakhane-ner/tree/main/data/pcm
+        The train, dev and test files are fetched with cached_path before the corpus is loaded.
+        :param base_path: Folder that contains the dataset folder. Default is None, in which case the "datasets"
+        folder under flair.cache_root is used.
+        :param tag_to_bioes: Passed on to ColumnCorpus; "ner" by default, the only tag column in the column format.
+        :param in_memory: Passed on to ColumnCorpus. If True, keeps dataset in memory giving speedups in training.
+        :param corpusargs: Any further keyword arguments are passed on to ColumnCorpus unchanged.
+        """
+        if isinstance(base_path, str):
+            base_path = Path(base_path)
+
+        # column format
+        columns = {0: "text", 1: "ner"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / dataset_name
+
+        # use the raw-content host: github.com/<owner>/<repo>/main/... is not a file-serving address
+        corpus_path = "https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/pcm/"
+
+        cached_path(f"{corpus_path}test.txt", Path("datasets") / dataset_name)
+        cached_path(f"{corpus_path}train.txt", Path("datasets") / dataset_name)
+        cached_path(f"{corpus_path}dev.txt", Path("datasets") / dataset_name)
+
+        super(NAIJA_PIDGIN_NER, self).__init__(
+            data_folder,
+            columns,
+            tag_to_bioes=tag_to_bioes,
+            in_memory=in_memory,
+            **corpusargs,
+        )
 
 
 class NER_BASQUE(ColumnCorpus):