Skip to content

Commit

Permalink
Merge pull request #2242 from Leonard-Hangen/master
Browse files Browse the repository at this point in the history
Add Luganda_NER Support
  • Loading branch information
alanakbik authored Apr 21, 2021
2 parents 87638b3 + 95aaed2 commit 263cd7a
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 2 deletions.
1 change: 1 addition & 0 deletions flair/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from .sequence_labeling import JAPANESE_NER
from .sequence_labeling import KINYARWANDA_NER
from .sequence_labeling import LER_GERMAN
from .sequence_labeling import LUGANDA_NER
from .sequence_labeling import LUO_NER
from .sequence_labeling import MIT_MOVIE_NER_SIMPLE
from .sequence_labeling import MIT_MOVIE_NER_COMPLEX
Expand Down
56 changes: 55 additions & 1 deletion flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1878,7 +1878,61 @@ def __init__(
in_memory=in_memory,
**corpusargs,
)


class LUGANDA_NER(ColumnCorpus):
    """Luganda NER corpus read from two-column (text, ner) files."""

    def __init__(
            self,
            base_path: Union[str, Path] = None,
            tag_to_bioes: str = "ner",
            in_memory: bool = True,
            **corpusargs,
    ):
        """
        Initialize the LugandaNER corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: Passed through to ColumnCorpus; 'ner' by default, which is the only tag column
        this corpus declares, so it normally need not be changed.
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param corpusargs: Any further keyword arguments are forwarded unchanged to ColumnCorpus.
        """
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        # The files must be fetched from the raw-content host: a github.com URL without a
        # "raw"/"blob" segment does not resolve to the file. No trailing slash here, because
        # the separator is added when the per-file URL is built below.
        luganda_ner_path = "https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/lug"
        dev_file = "dev.txt"
        test_file = "test.txt"
        train_file = "train.txt"
        for file_name in (dev_file, test_file, train_file):
            cached_path(f"{luganda_ner_path}/{file_name}", Path("datasets") / dataset_name)

        # NOTE(review): files are read as latin-1 here; confirm this matches the upstream encoding.
        super(LUGANDA_NER, self).__init__(
            data_folder,
            columns,
            dev_file=dev_file,
            test_file=test_file,
            train_file=train_file,
            column_delimiter=" ",
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
            document_separator_token="-DOCSTART-",
            **corpusargs,
        )

class NAIJA_PIDGIN_NER(ColumnCorpus):
def __init__(
Expand Down
3 changes: 2 additions & 1 deletion resources/docs/TUTORIAL_6_CORPUS.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,8 @@ data the first time you call the corresponding constructor ID. The following dat
| 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) |
| 'EUROPARL_NER_GERMAN' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches |
| 'JAPANESE_NER' | Japanese | [IOB2Corpus](https://github.com/Hironsan/IOB2Corpus) Japanese NER dataset automatically generated from Wikipedia |
| 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents |
| 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents |
| 'LUGANDA_NER' | Luganda | [LUGANDA_NER](https://github.com/masakhane-io/masakhane-ner/tree/main/data/lug) |
| 'MIT_MOVIE_NER_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER |
| 'MIT_MOVIE_NER_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER |
| 'MIT_RESTAURANT_NER' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) |
Expand Down

0 comments on commit 263cd7a

Please sign in to comment.