Add German MobIE NER Dataset #3351

Merged · 5 commits · Oct 24, 2023
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
@@ -194,6 +194,7 @@
    NER_GERMAN_EUROPARL,
    NER_GERMAN_GERMEVAL,
    NER_GERMAN_LEGAL,
    NER_GERMAN_MOBIE,
    NER_GERMAN_POLITICS,
    NER_HIPE_2022,
    NER_HUNGARIAN,
@@ -469,6 +470,7 @@
"NER_GERMAN_EUROPARL",
"NER_GERMAN_GERMEVAL",
"NER_GERMAN_LEGAL",
"NER_GERMAN_MOBIE",
"NER_GERMAN_POLITICS",
"NER_HIPE_2022",
"NER_HUNGARIAN",
47 changes: 47 additions & 0 deletions flair/datasets/sequence_labeling.py
@@ -4764,6 +4764,53 @@ def __init__(
        )


class NER_GERMAN_MOBIE(ColumnCorpus):
    def __init__(
        self,
        base_path: Optional[Union[str, Path]] = None,
        in_memory: bool = True,
        **corpusargs,
    ) -> None:
"""Initialize the German MobIE NER dataset.

The German MobIE Dataset was introduced in the MobIE paper (https://aclanthology.org/2021.konvens-1.22/).

This is a German-language dataset that has been human-annotated with 20 coarse- and fine-grained entity types,
and it includes entity linking information for geographically linkable entities. The dataset comprises 3,232
social media texts and traffic reports, totaling 91K tokens, with 20.5K annotated entities, of which 13.1K are
linked to a knowledge base. In total, 20 different named entities are annotated.
:param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
to point to a different folder but typically this should not be necessary.
:param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
"""
        base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
        dataset_name = self.__class__.__name__.lower()
        # Use one folder for both extraction and loading, so that an
        # overridden base_path is respected consistently.
        data_folder = base_path / dataset_name

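        # The extracted CoNLL-2003-style files are read with the token taken
        # from column 0 and the NER tag from column 3; other columns are ignored.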
        columns = {0: "text", 3: "ner"}

        train_data_file = data_folder / "train.conll2003"
        if not train_data_file.is_file():
            # Download the CoNLL03-formatted release of MobIE and unpack it
            # into the data folder.
            temp_file = cached_path(
                "https://github.com/DFKI-NLP/MobIE/raw/master/v1_20210811/ner_conll03_formatted.zip",
                Path("datasets") / dataset_name,
            )
            from zipfile import ZipFile

            with ZipFile(temp_file, "r") as zip_file:
                zip_file.extractall(path=data_folder)

        super().__init__(
            data_folder,
            columns,
            in_memory=in_memory,
            comment_symbol=None,
            document_separator_token="-DOCSTART-",
            **corpusargs,
        )


class MASAKHA_POS(MultiCorpus):
    def __init__(
        self,
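For context, a minimal usage sketch of the new loader (not part of this diff; it assumes Flair's standard Corpus API, where the label type of ColumnCorpus-based NER datasets is "ner"):

from flair.datasets import NER_GERMAN_MOBIE

# The first call downloads the corpus into the Flair cache and extracts it;
# subsequent calls load the cached CoNLL files directly.
corpus = NER_GERMAN_MOBIE()

# Inspect the split sizes and the labels found in the "ner" column.
print(corpus)
print(corpus.make_label_dictionary(label_type="ner"))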
23 changes: 23 additions & 0 deletions tests/test_datasets.py
@@ -872,6 +872,29 @@ def check_number_sentences(reference: int, actual: int, split_name: str, language
    check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)


def test_german_mobie(tasks_base_path):
    corpus = flair.datasets.NER_GERMAN_MOBIE()

    # See MobIE paper (https://aclanthology.org/2021.konvens-1.22/), Table 2
    ref_sentences = 7_077
    ref_tokens = 90_971

    actual_sentences = sum(
        1 for sentence in corpus.train + corpus.dev + corpus.test if sentence[0].text != "-DOCSTART-"
    )
    actual_tokens = sum(
        len(sentence) for sentence in corpus.train + corpus.dev + corpus.test if sentence[0].text != "-DOCSTART-"
    )

    assert ref_sentences == actual_sentences, (
        f"Number of parsed sentences ({actual_sentences}) does not match the "
        f"reported number of sentences ({ref_sentences})!"
    )
    assert (
        ref_tokens == actual_tokens
    ), f"Number of parsed tokens ({actual_tokens}) does not match the reported number of tokens ({ref_tokens})!"


def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
    corpus = MultiFileJsonlCorpus(
        train_files=[tasks_base_path / "jsonl/train.jsonl"],