Skip to content

Commit

Permalink
Merge pull request #2227 from mart1nro/add_naija_pidgin_ner_corpus
Browse files Browse the repository at this point in the history
Added Naija Pidgin NER dataset (Introduction to NLP @ HU Berlin)
  • Loading branch information
alanakbik authored Apr 19, 2021
2 parents 6cf6c03 + 5a3ce09 commit 8392dc6
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 2 deletions.
1 change: 1 addition & 0 deletions flair/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from .sequence_labeling import MIT_MOVIE_NER_SIMPLE
from .sequence_labeling import MIT_MOVIE_NER_COMPLEX
from .sequence_labeling import MIT_RESTAURANT_NER
from .sequence_labeling import NAIJA_PIDGIN_NER
from .sequence_labeling import NER_BASQUE
from .sequence_labeling import NER_FINNISH
from .sequence_labeling import NER_SWEDISH
Expand Down
51 changes: 49 additions & 2 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1594,7 +1594,7 @@ def __init__(
**corpusargs,
)


class KINYARWANDA_NER(ColumnCorpus):
def __init__(
self,
Expand All @@ -1603,7 +1603,6 @@ def __init__(
in_memory: bool = True,
**corpusargs,
):

if type(base_path) == str:
base_path: Path = Path(base_path)

Expand Down Expand Up @@ -1632,6 +1631,54 @@ def __init__(
in_memory=in_memory,
**corpusargs,
)


class NAIJA_PIDGIN_NER(ColumnCorpus):
    def __init__(
            self,
            base_path: Union[str, Path] = None,
            tag_to_bioes: str = "ner",
            in_memory: bool = True,
            **corpusargs,
    ):
        """
        Initialize the Naija Pidgin corpus for NER available on:
        https://github.com/masakhane-io/masakhane-ner/tree/main/data/pcm
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that the dataset folder is placed under the flair cache root.
        You can override this to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: Forwarded unchanged to ColumnCorpus. 'ner' by default, which is the only tag column
        this dataset defines, so it normally need not be changed.
        :param in_memory: Forwarded unchanged to ColumnCorpus. If True, keeps dataset in memory giving speedups
        in training.
        :param corpusargs: Any further keyword arguments are forwarded unchanged to ColumnCorpus.
        """
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # column format: token text in column 0, NER tag in column 1
        columns = {0: "text", 1: "ner"}

        # this dataset name (lower-cased class name, used as folder name)
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # Files must be fetched from the raw-content host; the plain
        # github.com/<owner>/<repo>/<branch>/... form is not a file download URL.
        corpus_path = "https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/pcm/"

        # NOTE(review): the download target is always "datasets/<dataset_name>" as handed to
        # cached_path, independent of a user-supplied base_path - presumably relative to the
        # flair cache root; confirm against cached_path.
        for split_file in ("test.txt", "train.txt", "dev.txt"):
            cached_path(f"{corpus_path}{split_file}", Path("datasets") / dataset_name)

        super().__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            **corpusargs,
        )


class NER_BASQUE(ColumnCorpus):
Expand Down

0 comments on commit 8392dc6

Please sign in to comment.