diff --git a/flair/__init__.py b/flair/__init__.py index 5702389c5..1dae2cd43 100644 --- a/flair/__init__.py +++ b/flair/__init__.py @@ -25,7 +25,7 @@ import logging.config -__version__ = "0.8" +__version__ = "0.8.1" logging.config.dictConfig( { diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 84846dcda..2f034057e 100755 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -14,12 +14,14 @@ from .sequence_labeling import CONLL_03 from .sequence_labeling import CONLL_03_GERMAN from .sequence_labeling import CONLL_03_DUTCH +from .sequence_labeling import ICELANDIC_NER from .sequence_labeling import CONLL_03_SPANISH from .sequence_labeling import CONLL_2000 from .sequence_labeling import DANE from .sequence_labeling import EUROPARL_NER_GERMAN from .sequence_labeling import GERMEVAL_14 from .sequence_labeling import INSPEC +from .sequence_labeling import JAPANESE_NER from .sequence_labeling import LER_GERMAN from .sequence_labeling import MIT_MOVIE_NER_SIMPLE from .sequence_labeling import MIT_MOVIE_NER_COMPLEX @@ -56,6 +58,7 @@ from .sequence_labeling import WSD_UFSAC from .sequence_labeling import WNUT_2020_NER from .sequence_labeling import XTREME +from .sequence_labeling import REDDIT_EL_GOLD # Expose all document classification datasets from .document_classification import ClassificationCorpus diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py index adeb7f2c2..c5b3cce3c 100644 --- a/flair/datasets/document_classification.py +++ b/flair/datasets/document_classification.py @@ -11,7 +11,7 @@ Corpus, Token, FlairDataset, - Tokenizer + Tokenizer, DataPair ) from flair.tokenization import SegtokTokenizer, SpaceTokenizer from flair.datasets.base import find_train_dev_test_files @@ -454,9 +454,12 @@ def __init__( # most data sets have the token text in the first column, if not, pass 'text' as column self.text_columns: List[int] = [] + self.pair_columns: List[int] = [] for column in column_name_map: if column_name_map[column] == "text": self.text_columns.append(column) + if column_name_map[column] == "pair": + self.pair_columns.append(column) with open(self.path_to_file, encoding=encoding) as csv_file: @@ -488,26 +491,8 @@ def __init__( if self.in_memory: - text = " ".join( - [row[text_column] for text_column in self.text_columns] - ) - - if self.max_chars_per_doc > 0: - text = text[: self.max_chars_per_doc] - - sentence = Sentence(text, use_tokenizer=self.tokenizer) - - for column in self.column_name_map: - column_value = row[column] - if ( - self.column_name_map[column].startswith("label") - and column_value - ): - if column_value != self.no_class_label: - sentence.add_label(label_type, column_value) + sentence = self._make_labeled_data_point(row) - if 0 < self.max_tokens_per_doc < len(sentence): - sentence.tokens = sentence.tokens[: self.max_tokens_per_doc] self.sentences.append(sentence) else: @@ -515,6 +500,52 @@ def __init__( self.total_sentence_count += 1 + def _make_labeled_data_point(self, row): + + # make sentence from text (and filter for length) + text = " ".join( + [row[text_column] for text_column in self.text_columns] + ) + + if self.max_chars_per_doc > 0: + text = text[: self.max_chars_per_doc] + + sentence = Sentence(text, use_tokenizer=self.tokenizer) + + if 0 < self.max_tokens_per_doc < len(sentence): + sentence.tokens = sentence.tokens[: self.max_tokens_per_doc] + + # if a pair column is defined, make a sentence pair object + if len(self.pair_columns) > 0: + + text = " ".join( + 
[row[pair_column] for pair_column in self.pair_columns] + ) + + if self.max_chars_per_doc > 0: + text = text[: self.max_chars_per_doc] + + pair = Sentence(text, use_tokenizer=self.tokenizer) + + if 0 < self.max_tokens_per_doc < len(sentence): + pair.tokens = pair.tokens[: self.max_tokens_per_doc] + + data_point = DataPair(first=sentence, second=pair) + + else: + data_point = sentence + + for column in self.column_name_map: + column_value = row[column] + if ( + self.column_name_map[column].startswith("label") + and column_value + ): + if column_value != self.no_class_label: + data_point.add_label(self.label_type, column_value) + + return data_point + def is_in_memory(self) -> bool: return self.in_memory @@ -527,20 +558,7 @@ def __getitem__(self, index: int = 0) -> Sentence: else: row = self.raw_data[index] - text = " ".join([row[text_column] for text_column in self.text_columns]) - - if self.max_chars_per_doc > 0: - text = text[: self.max_chars_per_doc] - - sentence = Sentence(text, use_tokenizer=self.tokenizer) - for column in self.column_name_map: - column_value = row[column] - if self.column_name_map[column].startswith("label") and column_value: - if column_value != self.no_class_label: - sentence.add_label(self.label_type, column_value) - - if 0 < self.max_tokens_per_doc < len(sentence): - sentence.tokens = sentence.tokens[: self.max_tokens_per_doc] + sentence = self._make_labeled_data_point(row) return sentence diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index f1f47427d..1dae398dc 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -2,8 +2,14 @@ import re import os import shutil +import glob from pathlib import Path from typing import Union, Dict, List +from os import listdir +import zipfile +from zipfile import ZipFile +import csv + import flair from flair.data import Corpus, MultiCorpus, FlairDataset, Sentence, Token @@ -592,6 +598,152 @@ def __offset_docstarts(file_in: Union[str, Path], file_out: Union[str, Path]): f.write("\n") + +class ICELANDIC_NER(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + """ + Initialize the ICELANDIC_NER corpus. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + if not os.path.isfile(data_folder / 'icelandic_ner.txt'): + # download zip + icelandic_ner ="https://repository.clarin.is/repository/xmlui/handle/20.500.12537/42/allzip" + icelandic_ner_path = cached_path(icelandic_ner, Path("datasets") / dataset_name) + + #unpacking the zip + unpack_file( + icelandic_ner_path, + data_folder, + mode="zip", + keep=True + ) + outputfile = os.path.abspath(data_folder) + + #merge the files in one as the zip is containing multiples files + + with open(outputfile/data_folder/"icelandic_ner.txt", "wb") as outfile: + for files in os.walk(outputfile/data_folder): + f = files[2] + for i in range(len(f)): + if f[i].endswith('.txt'): + with open(outputfile/data_folder/f[i], 'rb') as infile: + contents = infile.read() + outfile.write(contents) + + + super(ICELANDIC_NER, self).__init__( + data_folder, + columns, + train_file='icelandic_ner.txt', + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + **corpusargs, + ) + +class JAPANESE_NER(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + """ + Initialize the Hironsan/IOB2 corpus for Japanese. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: 'text', 1: 'ner'} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data from github if necessary (hironsan.txt, ja.wikipedia.conll) + IOB2_path = "https://raw.githubusercontent.com/Hironsan/IOB2Corpus/master/" + + # download files if not present locally + cached_path(f"{IOB2_path}hironsan.txt", data_folder / 'raw') + cached_path(f"{IOB2_path}ja.wikipedia.conll", data_folder / 'raw') + + # we need to modify the original files by adding new lines after after the end of each sentence + train_data_file = data_folder / 'train.txt' + if not train_data_file.is_file(): + self.__prepare_jap_wikinews_corpus(data_folder / 'raw' / "hironsan.txt", data_folder / 'train.txt') + self.__prepare_jap_wikipedia_corpus(data_folder / 'raw' / "ja.wikipedia.conll", data_folder / 'train.txt') + + super(JAPANESE_NER, self).__init__( + data_folder, + columns, + train_file='train.txt', + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + **corpusargs, + ) + + @staticmethod + def __prepare_jap_wikipedia_corpus(file_in: Union[str, Path], file_out: Union[str, Path]): + with open(file_in, 'r') as f: + lines = f.readlines() + with open(file_out, 'a') as f: + for line in lines: + if (line[0] == "。"): + f.write(line) + f.write("\n") + elif (line[0] == "\n"): + continue + else: + f.write(line) + + @staticmethod + def __prepare_jap_wikinews_corpus(file_in: Union[str, Path], file_out: Union[str, Path]): + with open(file_in, 'r') as f: + lines = f.readlines() + with open(file_out, 'a') as f: + for line in lines: + sp_line = line.split("\t") + if (sp_line[0] == "\n"): + f.write("\n") + else: + f.write(sp_line[0] + "\t" + sp_line[len(sp_line) - 1]) + class STACKOVERFLOW_NER(ColumnCorpus): def __init__( self, @@ -3445,3 +3597,222 @@ def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): else: liste = line.split() f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') + + +class REDDIT_EL_GOLD(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + in_memory: bool = True, + **corpusargs, + ): + """ + Initialize the Reddit Entity Linking corpus containing gold annotations only (https://arxiv.org/abs/2101.01228v2) in the NER-like column format. + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download and parse data if necessary + reddit_el_path = "https://zenodo.org/record/3970806/files/reddit_el.zip" + corpus_file_name = "reddit_el_gold.txt" + parsed_dataset = data_folder / corpus_file_name + + if not parsed_dataset.exists(): + reddit_el_zip = cached_path(f"{reddit_el_path}", Path("datasets") / dataset_name) + unpack_file(reddit_el_zip, data_folder, "zip", False) + + with open(data_folder / corpus_file_name, "w") as txtout: + + # First parse the post titles + with open(data_folder / "posts.tsv", "r") as tsvin1, open(data_folder / "gold_post_annotations.tsv", "r") as tsvin2: + + posts = csv.reader(tsvin1, delimiter="\t") + self.post_annotations = csv.reader(tsvin2, delimiter="\t") + self.curr_annot = next(self.post_annotations) + + for row in posts: # Go through all the post titles + + txtout.writelines("-DOCSTART-\n\n") # Start each post with a -DOCSTART- token + + # Keep track of how many and which entity mentions does a given post title have + link_annots = [] # [start pos, end pos, wiki page title] of an entity mention + + # Check if the current post title has an entity link and parse accordingly + if row[0] == self.curr_annot[0]: + + link_annots.append((int(self.curr_annot[4]), int(self.curr_annot[5]), self.curr_annot[3])) + link_annots = self._fill_annot_array(link_annots, row[0], post_flag = True) + + # Post titles with entity mentions (if any) are handled via this function + self._text_to_cols(Sentence(row[2], use_tokenizer = True), link_annots, txtout) + else: + self._text_to_cols(Sentence(row[2], use_tokenizer = True), link_annots, txtout) + + # Then parse the comments + with open(data_folder / "comments.tsv", "r") as tsvin3, open(data_folder / "gold_comment_annotations.tsv", "r") as tsvin4: + + self.comments = csv.reader(tsvin3, delimiter="\t") + self.comment_annotations = csv.reader(tsvin4, delimiter="\t") + self.curr_annot = next(self.comment_annotations) + self.curr_row = next(self.comments) + self.stop_iter = False + + # Iterate over the comments.tsv file, until the end is reached + while not self.stop_iter: + + txtout.writelines("-DOCSTART-\n") # Start each comment thread with a -DOCSTART- token + + # Keep track of the current comment thread and its corresponding key, on which the annotations are matched. + # Each comment thread is handled as one 'document'. + self.curr_comm = self.curr_row[4] + comm_key = self.curr_row[0] + + # Python's csv package for some reason fails to correctly parse a handful of rows inside the comments.tsv file. + # This if-condition is needed to handle this problem. 
+ if comm_key in {"en5rf4c", "es3ia8j", "es3lrmw"}: + if comm_key == "en5rf4c": + self.parsed_row = (r.split("\t") for r in self.curr_row[4].split("\n")) + self.curr_comm = next(self.parsed_row) + self._fill_curr_comment(fix_flag = True) + # In case we are dealing with properly parsed rows, proceed with a regular parsing procedure + else: + self._fill_curr_comment(fix_flag = False) + + link_annots = [] # [start pos, end pos, wiki page title] of an entity mention + + # Check if the current comment thread has an entity link and parse accordingly, same as with post titles above + if comm_key == self.curr_annot[0]: + link_annots.append((int(self.curr_annot[4]), int(self.curr_annot[5]), self.curr_annot[3])) + link_annots = self._fill_annot_array(link_annots, comm_key, post_flag = False) + self._text_to_cols(Sentence(self.curr_comm, use_tokenizer = True), link_annots, txtout) + else: + # In two of the comment thread a case of capital letter spacing occurs, which the SegtokTokenizer cannot properly handle. + # The following if-elif condition handles these two cases and as result writes full capitalized words in each corresponding row, + # and not just single letters into single rows. + if comm_key == "dv74ybb": + self.curr_comm = " ".join([word.replace(" ", "") for word in self.curr_comm.split(" ")]) + elif comm_key == "eci2lut": + self.curr_comm = (self.curr_comm[:18] + self.curr_comm[18:27].replace(" ", "") + self.curr_comm[27:55] + + self.curr_comm[55:68].replace(" ", "") + self.curr_comm[68:85] + self.curr_comm[85:92].replace(" ", "") + + self.curr_comm[92:]) + + self._text_to_cols(Sentence(self.curr_comm, use_tokenizer = True), link_annots, txtout) + + super(REDDIT_EL_GOLD, self).__init__( + data_folder, + columns, + train_file=corpus_file_name, + column_delimiter="\t", + in_memory=in_memory, + document_separator_token="-DOCSTART-", + **corpusargs, + ) + + def _text_to_cols(self, sentence: Sentence, links: list, outfile): + """ + Convert a tokenized sentence into column format + :param sentence: Flair Sentence object containing a tokenized post title or comment thread + :param links: array containing information about the starting and ending position of an entity mention, as well + as its corresponding wiki tag + :param outfile: file, to which the output is written + """ + for i in range(0, len(sentence)): + # If there are annotated entity mentions for given post title or a comment thread + if links: + # Keep track which is the correct corresponding entity link, in cases where there is >1 link in a sentence + link_index = [j for j,v in enumerate(links) if (sentence[i].start_pos >= v[0] and sentence[i].end_pos <= v[1])] + # Write the token with a corresponding tag to file + try: + if any(sentence[i].start_pos == v[0] and sentence[i].end_pos == v[1] for j,v in enumerate(links)): + outfile.writelines(sentence[i].text + "\tS-Link:" + links[link_index[0]][2] + "\n") + elif any(sentence[i].start_pos == v[0] and sentence[i].end_pos != v[1] for j,v in enumerate(links)): + outfile.writelines(sentence[i].text + "\tB-Link:" + links[link_index[0]][2] + "\n") + elif any(sentence[i].start_pos >= v[0] and sentence[i].end_pos <= v[1] for j,v in enumerate(links)): + outfile.writelines(sentence[i].text + "\tI-Link:" + links[link_index[0]][2] + "\n") + else: + outfile.writelines(sentence[i].text + "\tO\n") + # IndexError is raised in cases when there is exactly one link in a sentence, therefore can be dismissed + except IndexError: + pass + + # If a comment thread or a post title has no entity link, all tokens 
are assigned the O tag + else: + outfile.writelines(sentence[i].text + "\tO\n") + + # Prevent writing empty lines if e.g. a quote comes after a dot or initials are tokenized + # incorrectly, in order to keep the desired format (empty line as a sentence separator). + try: + if ((sentence[i].text in {".", "!", "?", "!*"}) and + (sentence[i+1].text not in {'"', '“', "'", "''", "!", "?", ";)", "."}) and + ("." not in sentence[i-1].text)): + outfile.writelines("\n") + except IndexError: + # Thrown when the second check above happens, but the last token of a sentence is reached. + # Indicates that the EOS punctuaion mark is present, therefore an empty line needs to be written below. + outfile.writelines("\n") + + # If there is no punctuation mark indicating EOS, an empty line is still needed after the EOS + if sentence[-1].text not in {".", "!", "?"}: + outfile.writelines("\n") + + def _fill_annot_array(self, annot_array: list, key: str, post_flag: bool) -> list: + """ + Fills the array containing information about the entity mention annotations, used in the _text_to_cols method + :param annot_array: array to be filled + :param key: reddit id, on which the post title/comment thread is matched with its corresponding annotation + :param post_flag: flag indicating whether the annotations are collected for the post titles (=True) + or comment threads (=False) + """ + next_annot = None + while True: + # Check if further annotations belong to the current post title or comment thread as well + try: + next_annot = next(self.post_annotations) if post_flag else next(self.comment_annotations) + if next_annot[0] == key: + annot_array.append((int(next_annot[4]), int(next_annot[5]), next_annot[3])) + else: + self.curr_annot = next_annot + break + # Stop when the end of an annotation file is reached + except StopIteration: + break + return annot_array + + def _fill_curr_comment(self, fix_flag: bool): + """ + Extends the string containing the current comment thread, which is passed to _text_to_cols method, when the + comments are parsed. 
+ :param fix_flag: flag indicating whether the method is called when the incorrectly imported rows are parsed (=True) + or regular rows (=False) + """ + next_row = None + while True: + # Check if further annotations belong to the current sentence as well + try: + next_row = next(self.comments) if not fix_flag else next(self.parsed_row) + if len(next_row) < 2: + # 'else " "' is needed to keep the proper token positions (for accordance with annotations) + self.curr_comm += next_row[0] if any(next_row) else " " + else: + self.curr_row = next_row + break + except StopIteration: # When the end of the comments.tsv file is reached + self.curr_row = next_row + self.stop_iter = True if not fix_flag else False + break diff --git a/flair/embeddings/__init__.py b/flair/embeddings/__init__.py index 9776320ad..84a064d01 100644 --- a/flair/embeddings/__init__.py +++ b/flair/embeddings/__init__.py @@ -26,6 +26,7 @@ from .document import DocumentTFIDFEmbeddings from .document import DocumentRNNEmbeddings from .document import DocumentLMEmbeddings +from .document import DocumentCNNEmbeddings from .document import SentenceTransformerDocumentEmbeddings # Expose image embedding classes diff --git a/flair/embeddings/document.py b/flair/embeddings/document.py index f743aed7f..9811bb377 100644 --- a/flair/embeddings/document.py +++ b/flair/embeddings/document.py @@ -217,7 +217,8 @@ def __setstate__(self, d): if "config_state_dict" in d: # load transformer model - config_class = CONFIG_MAPPING[d["config_state_dict"]["model_type"]] + model_type = d["config_state_dict"]["model_type"] if "model_type" in d["config_state_dict"] else "bert" + config_class = CONFIG_MAPPING[model_type] loaded_config = config_class.from_dict(d["config_state_dict"]) # constructor arguments @@ -585,6 +586,59 @@ def _apply(self, fn): child_module._apply(fn) + def __getstate__(self): + + # serialize the language models and the constructor arguments (but nothing else) + model_state = { + "state_dict": self.state_dict(), + + "embeddings": self.embeddings.embeddings, + "hidden_size": self.rnn.hidden_size, + "rnn_layers": self.rnn.num_layers, + "reproject_words": self.reproject_words, + "reproject_words_dimension": self.embeddings_dimension, + "bidirectional": self.bidirectional, + "dropout": self.dropout.p if self.dropout is not None else 0., + "word_dropout": self.word_dropout.p if self.word_dropout is not None else 0., + "locked_dropout": self.locked_dropout.p if self.locked_dropout is not None else 0., + "rnn_type": self.rnn_type, + "fine_tune": not self.static_embeddings, + } + + return model_state + + def __setstate__(self, d): + + # special handling for deserializing language models + if "state_dict" in d: + + # re-initialize language model with constructor arguments + language_model = DocumentRNNEmbeddings( + embeddings=d['embeddings'], + hidden_size=d['hidden_size'], + rnn_layers=d['rnn_layers'], + reproject_words=d['reproject_words'], + reproject_words_dimension=d['reproject_words_dimension'], + bidirectional=d['bidirectional'], + dropout=d['dropout'], + word_dropout=d['word_dropout'], + locked_dropout=d['locked_dropout'], + rnn_type=d['rnn_type'], + fine_tune=d['fine_tune'], + ) + + language_model.load_state_dict(d['state_dict']) + + # copy over state dictionary to self + for key in language_model.__dict__.keys(): + self.__dict__[key] = language_model.__dict__[key] + + # set the language model to eval() by default (this is necessary since FlairEmbeddings "protect" the LM + # in their "self.train()" method) + self.eval() + + else: + 
self.__dict__ = d class DocumentLMEmbeddings(DocumentEmbeddings): def __init__(self, flair_embeddings: List[FlairEmbeddings]): @@ -688,3 +742,162 @@ def _add_embeddings_to_sentences(self, sentences: List[Sentence]): def embedding_length(self) -> int: """Returns the length of the embedding vector.""" return self.model.get_sentence_embedding_dimension() + + +class DocumentCNNEmbeddings(DocumentEmbeddings): + def __init__( + self, + embeddings: List[TokenEmbeddings], + kernels=((100, 3), (100, 4), (100, 5)), + reproject_words: bool = True, + reproject_words_dimension: int = None, + dropout: float = 0.5, + word_dropout: float = 0.0, + locked_dropout: float = 0.0, + fine_tune: bool = True, + ): + """The constructor takes a list of embeddings to be combined. + :param embeddings: a list of token embeddings + :param kernels: list of (number of kernels, kernel size) + :param reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear + layer before putting them into the rnn or not + :param reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output + dimension as before will be taken. + :param dropout: the dropout value to be used + :param word_dropout: the word dropout value to be used, if 0.0 word dropout is not used + :param locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used + """ + super().__init__() + + self.embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embeddings) + self.length_of_all_token_embeddings: int = self.embeddings.embedding_length + + self.kernels = kernels + self.reproject_words = reproject_words + + self.static_embeddings = False if fine_tune else True + + self.embeddings_dimension: int = self.length_of_all_token_embeddings + if self.reproject_words and reproject_words_dimension is not None: + self.embeddings_dimension = reproject_words_dimension + + self.word_reprojection_map = torch.nn.Linear( + self.length_of_all_token_embeddings, self.embeddings_dimension + ) + + # CNN + self.__embedding_length: int = sum([kernel_num for kernel_num, kernel_size in self.kernels]) + self.convs = torch.nn.ModuleList( + [ + torch.nn.Conv1d(self.embeddings_dimension, kernel_num, kernel_size) for kernel_num, kernel_size in self.kernels + ] + ) + self.pool = torch.nn.AdaptiveMaxPool1d(1) + + self.name = "document_cnn" + + # dropouts + self.dropout = torch.nn.Dropout(dropout) if dropout > 0.0 else None + self.locked_dropout = ( + LockedDropout(locked_dropout) if locked_dropout > 0.0 else None + ) + self.word_dropout = WordDropout(word_dropout) if word_dropout > 0.0 else None + + torch.nn.init.xavier_uniform_(self.word_reprojection_map.weight) + + self.to(flair.device) + + self.eval() + + @property + def embedding_length(self) -> int: + return self.__embedding_length + + def _add_embeddings_internal(self, sentences: Union[List[Sentence], Sentence]): + """Add embeddings to all sentences in the given list of sentences. If embeddings are already added, update + only if embeddings are non-static.""" + + # TODO: remove in future versions + if not hasattr(self, "locked_dropout"): + self.locked_dropout = None + if not hasattr(self, "word_dropout"): + self.word_dropout = None + + if type(sentences) is Sentence: + sentences = [sentences] + + self.zero_grad() # is it necessary? 
+ + # embed words in the sentence + self.embeddings.embed(sentences) + + lengths: List[int] = [len(sentence.tokens) for sentence in sentences] + longest_token_sequence_in_batch: int = max(lengths) + + pre_allocated_zero_tensor = torch.zeros( + self.embeddings.embedding_length * longest_token_sequence_in_batch, + dtype=torch.float, + device=flair.device, + ) + + all_embs: List[torch.Tensor] = list() + for sentence in sentences: + all_embs += [ + emb for token in sentence for emb in token.get_each_embedding() + ] + nb_padding_tokens = longest_token_sequence_in_batch - len(sentence) + + if nb_padding_tokens > 0: + t = pre_allocated_zero_tensor[ + : self.embeddings.embedding_length * nb_padding_tokens + ] + all_embs.append(t) + + sentence_tensor = torch.cat(all_embs).view( + [ + len(sentences), + longest_token_sequence_in_batch, + self.embeddings.embedding_length, + ] + ) + + # before-RNN dropout + if self.dropout: + sentence_tensor = self.dropout(sentence_tensor) + if self.locked_dropout: + sentence_tensor = self.locked_dropout(sentence_tensor) + if self.word_dropout: + sentence_tensor = self.word_dropout(sentence_tensor) + + # reproject if set + if self.reproject_words: + sentence_tensor = self.word_reprojection_map(sentence_tensor) + + # push CNN + x = sentence_tensor + x = x.permute(0, 2, 1) + + rep = [self.pool(torch.nn.functional.relu(conv(x))) for conv in self.convs] + outputs = torch.cat(rep, 1) + + outputs = outputs.reshape(outputs.size(0), -1) + + # after-CNN dropout + if self.dropout: + outputs = self.dropout(outputs) + if self.locked_dropout: + outputs = self.locked_dropout(outputs) + + # extract embeddings from CNN + for sentence_no, length in enumerate(lengths): + embedding = outputs[sentence_no] + + if self.static_embeddings: + embedding = embedding.detach() + + sentence = sentences[sentence_no] + sentence.set_embedding(self.name, embedding) + + def _apply(self, fn): + for child_module in self.children(): + child_module._apply(fn) \ No newline at end of file diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 4692dd402..c0d74bc82 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -409,8 +409,8 @@ def __init__(self, "multi-backward": f"{hu_path}/lm-jw300-backward-v0.1.pt", "multi-v0-forward": f"{hu_path}/lm-multi-forward-v0.1.pt", "multi-v0-backward": f"{hu_path}/lm-multi-backward-v0.1.pt", - "multi-v0-forward-fast": f"{hu_path}/lm-multi-forward-fast-v0.1.pt", - "multi-v0-backward-fast": f"{hu_path}/lm-multi-backward-fast-v0.1.pt", + "multi-forward-fast": f"{hu_path}/lm-multi-forward-fast-v0.1.pt", + "multi-backward-fast": f"{hu_path}/lm-multi-backward-fast-v0.1.pt", # English models "en-forward": f"{hu_path}/news-forward-0.4.1.pt", "en-backward": f"{hu_path}/news-backward-0.4.1.pt", @@ -591,6 +591,7 @@ def train(self, mode=True): if "chars_per_chunk" not in self.__dict__: self.chars_per_chunk = 512 + # unless fine-tuning is set, do not set language model to train() in order to disallow language model dropout if not self.fine_tune: pass else: @@ -1274,7 +1275,8 @@ def __setstate__(self, d): if "config_state_dict" in d: # load transformer model - config_class = CONFIG_MAPPING[d["config_state_dict"]["model_type"]] + model_type = d["config_state_dict"]["model_type"] if "model_type" in d["config_state_dict"] else "bert" + config_class = CONFIG_MAPPING[model_type] loaded_config = config_class.from_dict(d["config_state_dict"]) # constructor arguments diff --git a/flair/models/language_model.py b/flair/models/language_model.py index 
1c632c2da..27f4b245e 100644 --- a/flair/models/language_model.py +++ b/flair/models/language_model.py @@ -399,6 +399,54 @@ def calculate_perplexity(self, text: str) -> float: return perplexity + def __getstate__(self): + + # serialize the language models and the constructor arguments (but nothing else) + model_state = { + "state_dict": self.state_dict(), + + "dictionary": self.dictionary, + "is_forward_lm": self.is_forward_lm, + "hidden_size": self.hidden_size, + "nlayers": self.nlayers, + "embedding_size": self.embedding_size, + "nout": self.nout, + "document_delimiter": self.document_delimiter, + "dropout": self.dropout, + } + + return model_state + + def __setstate__(self, d): + + # special handling for deserializing language models + if "state_dict" in d: + + # re-initialize language model with constructor arguments + language_model = LanguageModel( + dictionary=d['dictionary'], + is_forward_lm=d['is_forward_lm'], + hidden_size=d['hidden_size'], + nlayers=d['nlayers'], + embedding_size=d['embedding_size'], + nout=d['nout'], + document_delimiter=d['document_delimiter'], + dropout=d['dropout'], + ) + + language_model.load_state_dict(d['state_dict']) + + # copy over state dictionary to self + for key in language_model.__dict__.keys(): + self.__dict__[key] = language_model.__dict__[key] + + # set the language model to eval() by default (this is necessary since FlairEmbeddings "protect" the LM + # in their "self.train()" method) + self.eval() + + else: + self.__dict__ = d + def _apply(self, fn): # models that were serialized using torch versions older than 1.4.0 lack the _flat_weights_names attribute diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 3e95153f1..884a05f38 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -53,7 +53,7 @@ def __init__( super(TextClassifier, self).__init__() - self.document_embeddings: flair.embeddings.DocumentRNNEmbeddings = document_embeddings + self.document_embeddings: flair.embeddings.DocumentEmbeddings = document_embeddings self.label_dictionary: Dictionary = label_dictionary self.label_type = label_type @@ -474,7 +474,7 @@ def _fetch_model(model_name) -> str: hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models" model_map["de-offensive-language"] = "/".join( - [hu_path, "de-offensive-language", "germ-eval-2018-task-1-v0.5.pt"] + [hu_path, "de-offensive-language", "germ-eval-2018-task-1-v0.8.pt"] ) # English sentiment models @@ -485,7 +485,7 @@ def _fetch_model(model_name) -> str: [hu_path, "sentiment-curated-distilbert", "sentiment-en-mix-distillbert_4.pt"] ) model_map["sentiment-fast"] = "/".join( - [hu_path, "sentiment-curated-fasttext-rnn", "sentiment-en-mix-ft-rnn.pt"] + [hu_path, "sentiment-curated-fasttext-rnn", "sentiment-en-mix-ft-rnn_v8.pt"] ) # Communicative Functions Model @@ -558,6 +558,15 @@ def __init__( nn.init.xavier_uniform_(self.decoder.weight) + # else, set separator to concatenate two sentences + else: + self.sep = ' ' + if isinstance(self.document_embeddings, flair.embeddings.document.TransformerDocumentEmbeddings): + if self.document_embeddings.tokenizer.sep_token: + self.sep = ' ' + str(self.document_embeddings.tokenizer.sep_token) + ' ' + else: + self.sep = ' [SEP] ' + def _get_state_dict(self): model_state = super()._get_state_dict() model_state["bi_mode"] = self.bi_mode @@ -605,15 +614,12 @@ def forward(self, datapairs): else: # concatenate the sentences and embed together - # TODO: Transformers use 
special separator symbols in the beginning and between elements - # of datapair. Here should be a case dinstintion between the different transformers. - if isinstance(self.document_embeddings, flair.embeddings.document.TransformerDocumentEmbeddings): - sep = '[SEP]' - else: - sep = ' ' - - concatenated_sentences = [Sentence(pair.first.to_plain_string() + sep + pair.second.to_plain_string()) for - pair in datapairs] + concatenated_sentences = [ + Sentence( + pair.first.to_tokenized_string() + self.sep + pair.second.to_tokenized_string(), + use_tokenizer=False + ) + for pair in datapairs] self.document_embeddings.embed(concatenated_sentences) @@ -900,11 +906,11 @@ def _get_state_dict(self): def _init_model_with_state_dict(state): task_name = state["current_task"] print("init TARS") - + # init new TARS classifier model = TARSClassifier( task_name, - label_dictionary = state["task_specific_attributes"][task_name]['label_dictionary'], + label_dictionary=state["task_specific_attributes"][task_name]['label_dictionary'], document_embeddings=state["tars_model"].document_embeddings, num_negative_labels_to_sample=state["num_negative_labels_to_sample"], ) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index f797ff2b0..8233112ed 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -39,12 +39,12 @@ class ModelTrainer: def __init__( - self, - model: flair.nn.Model, - corpus: Corpus, - optimizer: torch.optim.Optimizer = SGD, - epoch: int = 0, - use_tensorboard: bool = False, + self, + model: flair.nn.Model, + corpus: Corpus, + optimizer: torch.optim.Optimizer = SGD, + epoch: int = 0, + use_tensorboard: bool = False, ): """ Initialize a model trainer @@ -61,40 +61,40 @@ def __init__( self.use_tensorboard: bool = use_tensorboard def train( - self, - base_path: Union[Path, str], - learning_rate: float = 0.1, - mini_batch_size: int = 32, - mini_batch_chunk_size: int = None, - max_epochs: int = 100, - scheduler = AnnealOnPlateau, - cycle_momentum: bool = False, - anneal_factor: float = 0.5, - patience: int = 3, - initial_extra_patience = 0, - min_learning_rate: float = 0.0001, - train_with_dev: bool = False, - train_with_test: bool = False, - monitor_train: bool = False, - monitor_test: bool = False, - embeddings_storage_mode: str = "cpu", - checkpoint: bool = False, - save_final_model: bool = True, - anneal_with_restarts: bool = False, - anneal_with_prestarts: bool = False, - batch_growth_annealing: bool = False, - shuffle: bool = True, - param_selection_mode: bool = False, - write_weights: bool = False, - num_workers: int = 6, - sampler=None, - use_amp: bool = False, - amp_opt_level: str = "O1", - eval_on_train_fraction=0.0, - eval_on_train_shuffle=False, - save_model_at_each_epoch=False, - main_score_type=("micro avg", 'f1-score'), - **kwargs, + self, + base_path: Union[Path, str], + learning_rate: float = 0.1, + mini_batch_size: int = 32, + mini_batch_chunk_size: int = None, + max_epochs: int = 100, + scheduler=AnnealOnPlateau, + cycle_momentum: bool = False, + anneal_factor: float = 0.5, + patience: int = 3, + initial_extra_patience=0, + min_learning_rate: float = 0.0001, + train_with_dev: bool = False, + train_with_test: bool = False, + monitor_train: bool = False, + monitor_test: bool = False, + embeddings_storage_mode: str = "cpu", + checkpoint: bool = False, + save_final_model: bool = True, + anneal_with_restarts: bool = False, + anneal_with_prestarts: bool = False, + batch_growth_annealing: bool = False, + shuffle: bool = True, + param_selection_mode: bool = 
False, + write_weights: bool = False, + num_workers: int = 6, + sampler=None, + use_amp: bool = False, + amp_opt_level: str = "O1", + eval_on_train_fraction=0.0, + eval_on_train_shuffle=False, + save_model_each_k_epochs: int = 0, + main_score_type=("micro avg", 'f1-score'), + **kwargs, ) -> dict: """ Trains any class that implements the flair.nn.Model interface. @@ -127,7 +127,9 @@ def train( if 'dev' the size is determined from dev set size :param eval_on_train_shuffle: if True the train data fraction is determined on the start of training and kept fixed during training, otherwise it's sampled at beginning of each epoch - :param save_model_at_each_epoch: If True, at each epoch the thus far trained model will be saved + :param save_model_each_k_epochs: Each k epochs, a model state will be written out. If set to '5', a model will + be saved each 5 epochs. Default is 0 which means no model saving. + :param save_model_epoch_step: Each save_model_epoch_step'th epoch the thus far trained model will be saved :param main_score_type: Type of metric to use for best model tracking and learning rate scheduling (if dev data is available, otherwise loss will be used) :param kwargs: Other arguments for the Optimizer :return: @@ -237,17 +239,18 @@ def train( # minimize training loss if training with dev data, else maximize dev score anneal_mode = "min" if train_with_dev else "max" - + if scheduler == OneCycleLR: dataset_size = len(self.corpus.train) if train_with_dev: dataset_size += len(self.corpus.dev) lr_scheduler = OneCycleLR(optimizer, - max_lr=learning_rate, - steps_per_epoch=dataset_size//mini_batch_size + 1, - epochs=max_epochs-self.epoch, # if we load a checkpoint, we have already trained for self.epoch - pct_start=0.0, - cycle_momentum=cycle_momentum) + max_lr=learning_rate, + steps_per_epoch=dataset_size // mini_batch_size + 1, + epochs=max_epochs - self.epoch, + # if we load a checkpoint, we have already trained for self.epoch + pct_start=0.0, + cycle_momentum=cycle_momentum) else: lr_scheduler = scheduler( optimizer, @@ -257,7 +260,7 @@ def train( mode=anneal_mode, verbose=True, ) - + if (isinstance(lr_scheduler, OneCycleLR) and batch_growth_annealing): raise ValueError("Batch growth with OneCycle policy is not implemented.") @@ -318,9 +321,9 @@ def train( # reload last best model if annealing with restarts is enabled if ( - (anneal_with_restarts or anneal_with_prestarts) - and learning_rate != previous_learning_rate - and (base_path / "best-model.pt").exists() + (anneal_with_restarts or anneal_with_prestarts) + and learning_rate != previous_learning_rate + and (base_path / "best-model.pt").exists() ): if anneal_with_restarts: log.info("resetting to best model") @@ -345,7 +348,7 @@ def train( batch_loader = DataLoader( train_data, batch_size=mini_batch_size, - shuffle=shuffle if self.epoch > 1 else False, # never shuffle the first epoch + shuffle=shuffle if self.epoch > 1 else False, # never shuffle the first epoch num_workers=num_workers, sampler=sampler, ) @@ -373,7 +376,7 @@ def train( batch_steps = [batch] if len(batch) > micro_batch_size: batch_steps = [ - batch[x : x + micro_batch_size] + batch[x: x + micro_batch_size] for x in range(0, len(batch), micro_batch_size) ] @@ -393,7 +396,7 @@ def train( # do the optimizer step torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() - + # do the scheduler step if one-cycle if isinstance(lr_scheduler, OneCycleLR): lr_scheduler.step() @@ -401,7 +404,7 @@ def train( for group in optimizer.param_groups: learning_rate = 
group["lr"] if "momentum" in group: - momentum = group["momentum"] + momentum = group["momentum"] seen_batches += 1 train_loss += loss.item() @@ -591,11 +594,11 @@ def train( # if we use dev data, remember best model based on dev evaluation score if ( - (not train_with_dev or anneal_with_restarts or anneal_with_prestarts) - and not param_selection_mode - and not isinstance(lr_scheduler, OneCycleLR) - and current_score == lr_scheduler.best - and bad_epochs == 0 + (not train_with_dev or anneal_with_restarts or anneal_with_prestarts) + and not param_selection_mode + and not isinstance(lr_scheduler, OneCycleLR) + and current_score == lr_scheduler.best + and bad_epochs == 0 ): print("saving best model") self.model.save(base_path / "best-model.pt") @@ -605,8 +608,8 @@ def train( self.model.load_state_dict(last_epoch_model_state_dict) self.model.save(base_path / "pre-best-model.pt") self.model.load_state_dict(current_state_dict) - - if save_model_at_each_epoch: + + if save_model_each_k_epochs > 0 and not self.epoch % save_model_each_k_epochs: print("saving model of current epoch") model_name = "model_epoch_" + str(self.epoch) + ".pt" self.model.save(base_path / model_name) @@ -659,7 +662,7 @@ def load_checkpoint(cls, checkpoint: Union[Path, str], corpus: Corpus): return model def final_test( - self, base_path: Union[Path, str], eval_mini_batch_size: int, num_workers: int = 8 + self, base_path: Union[Path, str], eval_mini_batch_size: int, num_workers: int = 8 ): if type(base_path) is str: base_path = Path(base_path) @@ -708,16 +711,16 @@ def final_test( return final_score def find_learning_rate( - self, - base_path: Union[Path, str], - file_name: str = "learning_rate.tsv", - start_learning_rate: float = 1e-7, - end_learning_rate: float = 10, - iterations: int = 100, - mini_batch_size: int = 32, - stop_early: bool = True, - smoothing_factor: float = 0.98, - **kwargs, + self, + base_path: Union[Path, str], + file_name: str = "learning_rate.tsv", + start_learning_rate: float = 1e-7, + end_learning_rate: float = 10, + iterations: int = 100, + mini_batch_size: int = 32, + stop_early: bool = True, + smoothing_factor: float = 0.98, + **kwargs, ) -> Path: best_loss = None moving_avg_loss = 0 @@ -768,11 +771,11 @@ def find_learning_rate( else: if smoothing_factor > 0: moving_avg_loss = ( - smoothing_factor * moving_avg_loss - + (1 - smoothing_factor) * loss_item + smoothing_factor * moving_avg_loss + + (1 - smoothing_factor) * loss_item ) loss_item = moving_avg_loss / ( - 1 - smoothing_factor ** (step + 1) + 1 - smoothing_factor ** (step + 1) ) if loss_item < best_loss: best_loss = loss diff --git a/requirements.txt b/requirements.txt index 128ef661c..e66f458d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,5 +20,4 @@ sentencepiece==0.1.95 konoha<5.0.0,>=4.0.0 janome gdown==3.12.2 -numpy<1.20.0 huggingface-hub diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index cc0af3f4c..80531ebc2 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -173,6 +173,7 @@ data the first time you call the corresponding constructor ID. 
The following dat | 'CONLL_03_SPANISH' | Spanish | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | | 'EUROPARL_NER_GERMAN' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches | +| 'JAPANESE_NER' | Japanese | [https://github.com/Hironsan/IOB2Corpus] Japanese NER dataset automatically generated from Wikipedia | | 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | | 'MIT_MOVIE_NER_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER | | 'MIT_MOVIE_NER_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER | diff --git a/setup.py b/setup.py index 1bf6d311a..5cfa85a62 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flair", - version="0.8", + version="0.8.1", description="A very simple framework for state-of-the-art NLP", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py index 10ea99650..3de2c132d 100644 --- a/tests/test_embeddings.py +++ b/tests/test_embeddings.py @@ -9,6 +9,7 @@ FlairEmbeddings, DocumentRNNEmbeddings, DocumentLMEmbeddings, TransformerWordEmbeddings, TransformerDocumentEmbeddings, + DocumentCNNEmbeddings, ) from flair.data import Sentence, Dictionary @@ -287,4 +288,21 @@ def test_transformer_document_embeddings(): sentence.clear_embeddings() + del embeddings + +def test_document_cnn_embeddings(): + sentence: Sentence = Sentence("I love Berlin. Berlin is a great place to live.") + + embeddings: DocumentCNNEmbeddings = DocumentCNNEmbeddings( + [glove, flair_embedding], kernels=((50, 2), (50, 3)) + ) + + embeddings.embed(sentence) + + assert len(sentence.get_embedding()) == 100 + assert len(sentence.get_embedding()) == embeddings.embedding_length + + sentence.clear_embeddings() + + assert len(sentence.get_embedding()) == 0 del embeddings \ No newline at end of file
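
Usage sketch for the three corpus loaders added in this diff (ICELANDIC_NER, JAPANESE_NER, REDDIT_EL_GOLD). The constructor signatures follow the classes above; the default cache folder under flair.cache_root and the automatic first-time download/preprocessing are assumed.

from flair.datasets import ICELANDIC_NER, JAPANESE_NER, REDDIT_EL_GOLD

# each loader downloads and re-formats its raw files on the first call
icelandic_corpus = ICELANDIC_NER()   # columns 0=text, 1=ner, merged into icelandic_ner.txt
japanese_corpus = JAPANESE_NER()     # Hironsan IOB2 files merged into train.txt
reddit_corpus = REDDIT_EL_GOLD()     # gold entity links written out in NER-style columns

print(japanese_corpus)               # prints the train/dev/test split sizes
print(japanese_corpus.train[0].to_tagged_string("ner"))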
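
The CSV classification changes above add a "pair" column type: rows that map both a "text" and a "pair" column are returned as DataPair objects instead of single Sentences, with the row's labels attached to the pair. A sketch under assumed file layout; the folder name, delimiter and label column are illustrative.

from flair.datasets import CSVClassificationCorpus

# hypothetical tab-separated train/dev/test files with columns: premise, hypothesis, label
corpus = CSVClassificationCorpus(
    "resources/tasks/my_pair_task",
    column_name_map={0: "text", 1: "pair", 2: "label"},
    skip_header=True,
    delimiter="\t",
)

data_point = corpus.train[0]   # a DataPair(first=..., second=...) carrying the row's label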
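
The new DocumentCNNEmbeddings pools stacked token embeddings through parallel 1-D convolutions with adaptive max pooling, so the document embedding length is the sum of the kernel counts. A sketch mirroring the unit test added in this diff:

from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentCNNEmbeddings

glove = WordEmbeddings("glove")
document_embeddings = DocumentCNNEmbeddings([glove], kernels=((50, 2), (50, 3)))

sentence = Sentence("I love Berlin . Berlin is a great place to live .")
document_embeddings.embed(sentence)

assert len(sentence.get_embedding()) == 100   # 50 filters of width 2 + 50 filters of width 3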
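
In the trainer, save_model_at_each_epoch is replaced by the integer parameter save_model_each_k_epochs: every k-th epoch a checkpoint named model_epoch_<n>.pt is written, and the default 0 disables it. A sketch with placeholder model and corpus objects:

from flair.trainers import ModelTrainer

# `tagger` and `corpus` stand in for any flair.nn.Model and Corpus
trainer = ModelTrainer(tagger, corpus)
trainer.train(
    "resources/taggers/example",
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=20,
    save_model_each_k_epochs=5,   # writes model_epoch_5.pt, model_epoch_10.pt, ...
)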
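
The fast multilingual character LMs are now registered as 'multi-forward-fast' and 'multi-backward-fast'; the old 'multi-v0-*-fast' keys are removed from the model map. Minimal sketch:

from flair.embeddings import FlairEmbeddings, StackedEmbeddings

stacked = StackedEmbeddings([
    FlairEmbeddings("multi-forward-fast"),
    FlairEmbeddings("multi-backward-fast"),
])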
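
LanguageModel and DocumentRNNEmbeddings now implement __getstate__/__setstate__ so that pickling stores only the constructor arguments plus state_dict(), and unpickling re-instantiates the module, reloads the weights and puts it into eval() mode. A round-trip sketch; the file path is illustrative.

import torch
from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings

embeddings = DocumentRNNEmbeddings([FlairEmbeddings("news-forward-fast")], hidden_size=128)

torch.save(embeddings, "document_rnn_embeddings.pt")   # serialized via the new __getstate__
reloaded = torch.load("document_rnn_embeddings.pt")    # __setstate__ rebuilds the RNN and loads weights
assert reloaded.embedding_length == embeddings.embedding_length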