From b1a356fb298ab2ca15b12640aaa41bb99f25a430 Mon Sep 17 00:00:00 2001
From: Robert Martin <martinro@informatik.hu-berlin.de>
Date: Sun, 18 Apr 2021 12:10:51 +0200
Subject: [PATCH 1/3] added Naija Pidgin NER dataset

---
 flair/datasets/__init__.py          |  1 +
 flair/datasets/sequence_labeling.py | 49 +++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
index 98eeb46d2..9230e3bfb 100755
--- a/flair/datasets/__init__.py
+++ b/flair/datasets/__init__.py
@@ -27,6 +27,7 @@
 from .sequence_labeling import MIT_MOVIE_NER_SIMPLE
 from .sequence_labeling import MIT_MOVIE_NER_COMPLEX
 from .sequence_labeling import MIT_RESTAURANT_NER
+from .sequence_labeling import NAIJA_PIDGIN_NER
 from .sequence_labeling import NER_BASQUE
 from .sequence_labeling import NER_FINNISH
 from .sequence_labeling import NER_SWEDISH
diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 33e6b1d56..1678984f5 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -1548,6 +1548,55 @@ def __init__(
         )
 
 
+class NAIJA_PIDGIN_NER(ColumnCorpus):
+    def __init__(
+            self,
+            base_path: Union[str, Path] = None,
+            tag_to_bioes: str = "ner",
+            in_memory: bool = True,
+            **corpusargs,
+    ):
+        """
+        Initialize the Naija Pidgin corpus for NER available on:
+        https://github.com/masakhane-io/masakhane-ner/tree/main/data/pcm
+        The first time you call this constructor it will automatically download the dataset.
+        :param base_path: Parent folder of the corpus folder; None (default) means flair.cache_root / 'datasets'.
+        NOTE(review): the cached_path calls below never receive base_path - confirm that overriding it works.
+        :param tag_to_bioes: Name of the tag column forwarded to ColumnCorpus, 'ner' by default. This corpus only
+        defines the columns 'text' and 'ner', so there is no other tag column to select here.
+        :param in_memory: If True, keeps dataset in memory giving speedups in training.
+        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
+        """
+        if type(base_path) == str:
+            base_path: Path = Path(base_path)
+
+        # column format
+        columns = {0: "text", 1: "ner"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / dataset_name
+
+        # download data if necessary
+        model_path = "https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/pcm/"
+
+        cached_path(f"{model_path}test.txt", Path("datasets") / dataset_name)
+        cached_path(f"{model_path}train.txt", Path("datasets") / dataset_name)
+        cached_path(f"{model_path}dev.txt", Path("datasets") / dataset_name)
+
+        super(NAIJA_PIDGIN_NER, self).__init__(
+            data_folder,
+            columns,
+            tag_to_bioes=tag_to_bioes,
+            in_memory=in_memory,
+            **corpusargs,
+        )
+
+
 class NER_BASQUE(ColumnCorpus):
     def __init__(
             self,

From 4b3da9655c145aabf4af5612d0ac51a458eb6723 Mon Sep 17 00:00:00 2001
From: Robert Martin <martinro@informatik.hu-berlin.de>
Date: Sun, 18 Apr 2021 13:00:32 +0200
Subject: [PATCH 2/3] renamed path variable

---
 flair/datasets/sequence_labeling.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 1678984f5..e985c8a38 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -1582,11 +1582,11 @@ def __init__(
         data_folder = base_path / dataset_name
 
         # download data if necessary
-        model_path = "https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/pcm/"
+        corpus_path = "https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/pcm/"
 
-        cached_path(f"{model_path}test.txt", Path("datasets") / dataset_name)
-        cached_path(f"{model_path}train.txt", Path("datasets") / dataset_name)
-        cached_path(f"{model_path}dev.txt", Path("datasets") / dataset_name)
+        cached_path(f"{corpus_path}test.txt", Path("datasets") / dataset_name)
+        cached_path(f"{corpus_path}train.txt", Path("datasets") / dataset_name)
+        cached_path(f"{corpus_path}dev.txt", Path("datasets") / dataset_name)
 
         super(NAIJA_PIDGIN_NER, self).__init__(
             data_folder,

From 5a3ce09a2a990e99438c2b34c35494624125854f Mon Sep 17 00:00:00 2001
From: Alan Akbik <alan.akbik@gmail.com>
Date: Mon, 19 Apr 2021 15:51:24 +0200
Subject: [PATCH 3/3] correct errors from merge

---
 flair/datasets/sequence_labeling.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index b12063959..7353e5e6e 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -1652,7 +1652,20 @@ def __init__(
         :param in_memory: If True, keeps dataset in memory giving speedups in training.
         :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
         """
+        if type(base_path) == str:
+            base_path: Path = Path(base_path)
 
+        # column format
+        columns = {0: "text", 1: "ner"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / dataset_name
+        
         corpus_path = "https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/pcm/"
 
         cached_path(f"{corpus_path}test.txt", Path("datasets") / dataset_name)