From 456473da9fccc66b32957c6dc5907cc00d264849 Mon Sep 17 00:00:00 2001 From: Leonard-Hangen <56963352+Leonard-Hangen@users.noreply.github.com> Date: Wed, 21 Apr 2021 14:36:58 +0200 Subject: [PATCH 1/2] Add Luganda NER Support Luganda or the Ganda language is a Bantu language spoken in the African Great Lakes region. It is one of the major languages in Uganda and is spoken by more than eight million Baganda and other people. ~Source Wikipedia: (https://en.wikipedia.org/wiki/Luganda) --- flair/datasets/__init__.py | 1 + flair/datasets/sequence_labeling.py | 56 ++++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 20d2299d5..9ed0a6a17 100755 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -27,6 +27,7 @@ from .sequence_labeling import JAPANESE_NER from .sequence_labeling import KINYARWANDA_NER from .sequence_labeling import LER_GERMAN +from .sequence_labeling import LUGANDA_NER from .sequence_labeling import LUO_NER from .sequence_labeling import MIT_MOVIE_NER_SIMPLE from .sequence_labeling import MIT_MOVIE_NER_COMPLEX diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 2b8fe4662..95192a609 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -1878,7 +1878,61 @@ def __init__( in_memory=in_memory, **corpusargs, ) - + +class LUGANDA_NER(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + """ + Initialize the LugandaNER corpus. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. 
+ :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + luganda_ner_path = "https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/lug/" + dev_file = "dev.txt" + test_file = "test.txt" + train_file = "train.txt" + cached_path(f"{luganda_ner_path}/{dev_file}", Path("datasets") / dataset_name) + cached_path(f"{luganda_ner_path}/{test_file}", Path("datasets") / dataset_name) + cached_path(f"{luganda_ner_path}/{train_file}", Path("datasets") / dataset_name) + + super(LUGANDA_NER, self).__init__( + data_folder, + columns, + dev_file=dev_file, + test_file=test_file, + train_file=train_file, + column_delimiter= " ", + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + in_memory=in_memory, + document_separator_token="-DOCSTART-", + **corpusargs, + ) class NAIJA_PIDGIN_NER(ColumnCorpus): def __init__( From 95aaed26c5109b372787c74621511600fa7e5eff Mon Sep 17 00:00:00 2001 From: Leonard-Hangen <56963352+Leonard-Hangen@users.noreply.github.com> Date: Wed, 21 Apr 2021 14:38:12 +0200 Subject: [PATCH 2/2] Add the Luganda language to the summary In Tutorial 6 there is a summary of the supported languages. With this commit, NER support for the Luganda language is added to this list. 
--- resources/docs/TUTORIAL_6_CORPUS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index 5cd979065..9dda5c75a 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -174,7 +174,8 @@ data the first time you call the corresponding constructor ID. The following dat | 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | | 'EUROPARL_NER_GERMAN' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches | | 'JAPANESE_NER' | Japanese | [https://github.com/Hironsan/IOB2Corpus] Japanese NER dataset automatically generated from Wikipedia | -| 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | +| 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | +| 'LUGANDA_NER' | Luganda | [LUGANDA_NER](https://github.com/masakhane-io/masakhane-ner/tree/main/data/lug) | | 'MIT_MOVIE_NER_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER | | 'MIT_MOVIE_NER_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER | | 'MIT_RESTAURANT_NER' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) |