From 86225d1e4595ab0da69888d01c7192250049db03 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 29 Sep 2021 09:02:57 +0200 Subject: [PATCH 1/8] GH-2454: store training parameters in model during training --- flair/nn/model.py | 20 +++- flair/trainers/trainer.py | 245 +++++++++++++++++++------------------- 2 files changed, 137 insertions(+), 128 deletions(-) diff --git a/flair/nn/model.py b/flair/nn/model.py index 1a95044d8..73889a197 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -1,4 +1,3 @@ -import copy import itertools import logging import warnings @@ -75,13 +74,16 @@ def _init_model_with_state_dict(state): def _fetch_model(model_name) -> str: return model_name - def save(self, model_file: Union[str, Path]): + def save(self, model_file: Union[str, Path], training_parameters: Optional[dict] = None): """ Saves the current model to the provided file. :param model_file: the model file """ model_state = self._get_state_dict() + if training_parameters: + model_state['training_parameters'] = training_parameters + torch.save(model_state, str(model_file), pickle_protocol=4) @classmethod @@ -102,11 +104,23 @@ def load(cls, model: Union[str, Path]): model = cls._init_model_with_state_dict(state) + if 'training_parameters' in state: + model.training_parameters = state['training_parameters'] + model.eval() model.to(flair.device) return model + def print_training_parameters(self): + if hasattr(self, 'training_parameters'): + param_out = "Training Parameters:\n" + for param in self.training_parameters: + param_out += f'{param} = {self.training_parameters[param]}\n' + log.info(param_out) + else: + log.info("Training Parameters not stored for this model") + class Classifier(Model): """Abstract base class for all Flair models that do classification, both single- and multi-label. 
@@ -175,7 +189,7 @@ def evaluate( for gold_label in datapoint.get_labels(gold_label_type): representation = str(sentence_id) + ': ' + gold_label.identifier - + value = gold_label.value if gold_label_dictionary and gold_label_dictionary.get_idx_for_item(value) == 0: value = '' diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index efe543026..417ee679c 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -6,6 +6,7 @@ import sys import time import warnings +from inspect import signature from pathlib import Path from typing import Union, Tuple, Optional @@ -54,7 +55,7 @@ def __init__( self.corpus: Corpus = corpus @staticmethod - def check_for_and_delete_previous_best_models(base_path, save_checkpoint): + def check_for_and_delete_previous_best_models(base_path): all_best_model_names = [filename for filename in os.listdir(base_path) if filename.startswith("best-model")] if len(all_best_model_names) != 0: @@ -64,10 +65,15 @@ def check_for_and_delete_previous_best_models(base_path, save_checkpoint): previous_best_path = os.path.join(base_path, single_model) if os.path.exists(previous_best_path): os.remove(previous_best_path) - if save_checkpoint: - best_checkpoint_path = previous_best_path.replace("model", "checkpoint") - if os.path.exists(best_checkpoint_path): - os.remove(best_checkpoint_path) + + def resume(self, + model, + **trainer_args, + ): + + self.model = model + + self.train(**self.model.training_parameters) def fine_tune(self, base_path: Union[Path, str], @@ -93,54 +99,52 @@ def fine_tune(self, **trainer_args, ) - def train( - self, - base_path: Union[Path, str], - learning_rate: float = 0.1, - mini_batch_size: int = 32, - mini_batch_chunk_size: Optional[int] = None, - max_epochs: int = 100, - scheduler=AnnealOnPlateau, - cycle_momentum: bool = False, - anneal_factor: float = 0.5, - patience: int = 3, - initial_extra_patience: int = 0, - min_learning_rate: float = 0.0001, - warmup_fraction: float = 0.1, - train_with_dev: bool = False, - train_with_test: bool = False, - monitor_train: bool = False, - monitor_test: bool = False, - embeddings_storage_mode: str = "cpu", - checkpoint: bool = False, - save_final_model: bool = True, - anneal_with_restarts: bool = False, - anneal_with_prestarts: bool = False, - anneal_against_dev_loss: bool = False, - batch_growth_annealing: bool = False, - shuffle: bool = True, - param_selection_mode: bool = False, - write_weights: bool = False, - num_workers: int = 6, - sampler=None, - use_amp: bool = False, - amp_opt_level: str = "O1", - eval_on_train_fraction: float = 0.0, - eval_on_train_shuffle: bool = False, - save_model_each_k_epochs: int = 0, - main_evaluation_metric: Tuple[str, str] = ("micro avg", 'f1-score'), - tensorboard_comment: str = '', - save_best_checkpoints: bool = False, - use_swa: bool = False, - use_final_model_for_eval: bool = False, - gold_label_dictionary_for_eval: Optional[Dictionary] = None, - optimizer: torch.optim.Optimizer = SGD, - epoch: int = 0, - use_tensorboard: bool = False, - tensorboard_log_dir=None, - metrics_for_tensorboard=[], - **kwargs, - ) -> dict: + def train(self, + base_path: Union[Path, str], + learning_rate: float = 0.1, + mini_batch_size: int = 32, + mini_batch_chunk_size: Optional[int] = None, + max_epochs: int = 100, + train_with_dev: bool = False, + train_with_test: bool = False, + monitor_train: bool = False, + monitor_test: bool = False, + embeddings_storage_mode: str = "cpu", + scheduler=AnnealOnPlateau, + optimizer: torch.optim.Optimizer = SGD, + cycle_momentum: bool 
= False, + anneal_factor: float = 0.5, + patience: int = 3, + initial_extra_patience: int = 0, + min_learning_rate: float = 0.0001, + warmup_fraction: float = 0.1, + save_final_model: bool = True, + anneal_with_restarts: bool = False, + anneal_with_prestarts: bool = False, + anneal_against_dev_loss: bool = False, + batch_growth_annealing: bool = False, + shuffle: bool = True, + param_selection_mode: bool = False, + write_weights: bool = False, + num_workers: int = 6, + sampler=None, + use_amp: bool = False, + amp_opt_level: str = "O1", + eval_on_train_fraction: float = 0.0, + eval_on_train_shuffle: bool = False, + save_model_each_k_epochs: int = 0, + main_evaluation_metric: Tuple[str, str] = ("micro avg", 'f1-score'), + tensorboard_comment: str = '', + checkpoint: bool = False, + use_swa: bool = False, + use_final_model_for_eval: bool = False, + gold_label_dictionary_for_eval: Optional[Dictionary] = None, + epoch: int = 0, + use_tensorboard: bool = False, + tensorboard_log_dir=None, + metrics_for_tensorboard=[], + **kwargs, + ) -> dict: """ Trains any class that implements the flair.nn.Model interface. :param base_path: Main path to which all output during training is logged and models are saved @@ -160,7 +164,6 @@ def train( :param monitor_test: If True, test data is evaluated at end of each epoch :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), 'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU) - :param checkpoint: If True, a full checkpoint is saved at end of each epoch :param save_final_model: If True, final model is saved :param anneal_with_restarts: If True, the last best model is restored when annealing the learning rate :param shuffle: If True, data is shuffled during training @@ -178,7 +181,6 @@ def train( :param save_model_epoch_step: Each save_model_epoch_step'th epoch the thus far trained model will be saved :param classification_main_metric: Type of metric to use for best model tracking and learning rate scheduling (if dev data is available, otherwise loss will be used), currently only applicable for text_classification_model :param tensorboard_comment: Comment to use for tensorboard logging - :param save_best_checkpoints: If True, in addition to saving the best model also the corresponding checkpoint is saved :param optimizer: The optimizer to use (typically SGD or Adam) :param epoch: The starting epoch (normally 0 but could be higher if you continue training model) :param use_tensorboard: If True, writes out tensorboard information @@ -189,6 +191,13 @@ def train( :return: """ + # remember training parameters + training_parameters = {} + local_variables = locals() + for parameter in signature(self.train).parameters: + training_parameters[parameter] = local_variables[parameter] + self.model.training_parameters = training_parameters + if use_tensorboard: try: from torch.utils.tensorboard import SummaryWriter @@ -200,9 +209,7 @@ def train( except: log_line(log) - log.warning( - "ATTENTION! PyTorch >= 1.1.0 and pillow are required for TensorBoard support!" - ) + log.warning("ATTENTION! 
PyTorch >= 1.1.0 and pillow are required for TensorBoard support!") log_line(log) use_tensorboard = False pass @@ -254,7 +261,7 @@ def train( log.warning(f'WARNING: Specified class weights will not take effect when using CRF') # check for previously saved best models in the current training folder and delete them - self.check_for_and_delete_previous_best_models(base_path, save_best_checkpoints) + self.check_for_and_delete_previous_best_models(base_path) # determine what splits (train, dev, test) to evaluate and log log_train = True if monitor_train else False @@ -280,7 +287,9 @@ def train( weight_extractor = WeightExtractor(base_path) - optimizer: torch.optim.Optimizer = optimizer(self.model.parameters(), lr=learning_rate, **kwargs) + # if optimizer class is passed, instantiate: + if inspect.isclass(optimizer): + optimizer: torch.optim.Optimizer = optimizer(self.model.parameters(), lr=learning_rate, **kwargs) if use_swa: import torchcontrib @@ -299,33 +308,35 @@ def train( if train_with_dev: dataset_size += len(self.corpus.dev) - if scheduler == OneCycleLR: - lr_scheduler = OneCycleLR(optimizer, - max_lr=learning_rate, - steps_per_epoch=dataset_size // mini_batch_size + 1, - epochs=max_epochs - epoch, - # if we load a checkpoint, we have already trained for epoch - pct_start=0.0, - cycle_momentum=cycle_momentum) - elif scheduler == LinearSchedulerWithWarmup: - steps_per_epoch = (dataset_size + mini_batch_size - 1) / mini_batch_size - num_train_steps = int(steps_per_epoch * max_epochs) - num_warmup_steps = int(num_train_steps * warmup_fraction) - - lr_scheduler = LinearSchedulerWithWarmup(optimizer, - num_train_steps=num_train_steps, - num_warmup_steps=num_warmup_steps) - else: - lr_scheduler = scheduler( - optimizer, - factor=anneal_factor, - patience=patience, - initial_extra_patience=initial_extra_patience, - mode=anneal_mode, - verbose=True, - ) + # if scheduler is passed as a class, instantiate + if inspect.isclass(scheduler): + if scheduler == OneCycleLR: + scheduler = OneCycleLR(optimizer, + max_lr=learning_rate, + steps_per_epoch=dataset_size // mini_batch_size + 1, + epochs=max_epochs - epoch, + # if we load a checkpoint, we have already trained for epoch + pct_start=0.0, + cycle_momentum=cycle_momentum) + elif scheduler == LinearSchedulerWithWarmup: + steps_per_epoch = (dataset_size + mini_batch_size - 1) / mini_batch_size + num_train_steps = int(steps_per_epoch * max_epochs) + num_warmup_steps = int(num_train_steps * warmup_fraction) + + scheduler = LinearSchedulerWithWarmup(optimizer, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps) + else: + scheduler = scheduler( + optimizer, + factor=anneal_factor, + patience=patience, + initial_extra_patience=initial_extra_patience, + mode=anneal_mode, + verbose=True, + ) - if (isinstance(lr_scheduler, OneCycleLR) and batch_growth_annealing): + if isinstance(scheduler, OneCycleLR) and batch_growth_annealing: raise ValueError("Batch growth with OneCycle policy is not implemented.") train_data = self.corpus.train @@ -365,6 +376,9 @@ def train( for epoch in range(epoch + 1, max_epochs + 1): log_line(log) + # update training parameter + self.model.training_parameters['epoch'] = epoch + if anneal_with_prestarts: last_epoch_model_state_dict = copy.deepcopy(self.model.state_dict()) @@ -405,7 +419,7 @@ def train( writer.add_scalar("learning_rate", learning_rate, epoch) # stop training if learning rate becomes too small - if ((not isinstance(lr_scheduler, (OneCycleLR, LinearSchedulerWithWarmup)) and + if ((not 
isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)) and learning_rate < min_learning_rate)): log_line(log) log.info("learning rate too small - quitting training!") @@ -470,8 +484,8 @@ def train( optimizer.step() # do the scheduler step if one-cycle or linear decay - if isinstance(lr_scheduler, (OneCycleLR, LinearSchedulerWithWarmup)): - lr_scheduler.step() + if isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)): + scheduler.step() # get new learning rate for group in optimizer.param_groups: learning_rate = group["lr"] @@ -505,9 +519,7 @@ def train( self.model.eval() log_line(log) - log.info( - f"EPOCH {epoch} done: loss {train_loss:.4f} - lr {learning_rate:.7f}" - ) + log.info(f"EPOCH {epoch} done: loss {train_loss:.4f} - lr {learning_rate:.7f}") if use_tensorboard: writer.add_scalar("train_loss", train_loss, epoch) @@ -540,9 +552,8 @@ def train( main_evaluation_metric=main_evaluation_metric, gold_label_dictionary=gold_label_dictionary_for_eval, ) - result_line += ( - f"\t{train_part_loss}\t{train_part_eval_result.log_line}" - ) + result_line += f"\t{train_part_loss}\t{train_part_eval_result.log_line}" + log.info( f"TRAIN_SPLIT : loss {train_part_loss} - {main_evaluation_metric[1]} ({main_evaluation_metric[0]}) {round(train_part_eval_result.main_score, 4)}" ) @@ -580,9 +591,7 @@ def train( if use_tensorboard: writer.add_scalar("dev_loss", dev_eval_result.loss, epoch) - writer.add_scalar( - "dev_score", dev_eval_result.main_score, epoch - ) + writer.add_scalar("dev_score", dev_eval_result.main_score, epoch) for (metric_class_avg_type, metric_type) in metrics_for_tensorboard: writer.add_scalar( f"dev_{metric_class_avg_type}_{metric_type}", @@ -610,9 +619,7 @@ def train( if use_tensorboard: writer.add_scalar("test_loss", test_eval_result.loss, epoch) - writer.add_scalar( - "test_score", test_eval_result.main_score, epoch - ) + writer.add_scalar("test_score", test_eval_result.main_score, epoch) for (metric_class_avg_type, metric_type) in metrics_for_tensorboard: writer.add_scalar( f"test_{metric_class_avg_type}_{metric_type}", @@ -627,8 +634,8 @@ def train( current_epoch_has_best_model_so_far = True best_validation_score = dev_score - if isinstance(lr_scheduler, AnnealOnPlateau): - lr_scheduler.step(dev_score, dev_eval_result.loss) + if isinstance(scheduler, AnnealOnPlateau): + scheduler.step(dev_score, dev_eval_result.loss) # alternative: anneal against dev loss if not train_with_dev and anneal_against_dev_loss: @@ -636,8 +643,8 @@ def train( current_epoch_has_best_model_so_far = True best_validation_score = dev_eval_result.loss - if isinstance(lr_scheduler, AnnealOnPlateau): - lr_scheduler.step(dev_eval_result.loss) + if isinstance(scheduler, AnnealOnPlateau): + scheduler.step(dev_eval_result.loss) # alternative: anneal against train loss if train_with_dev: @@ -645,14 +652,14 @@ def train( current_epoch_has_best_model_so_far = True best_validation_score = train_loss - if isinstance(lr_scheduler, AnnealOnPlateau): - lr_scheduler.step(train_loss) + if isinstance(scheduler, AnnealOnPlateau): + scheduler.step(train_loss) train_loss_history.append(train_loss) # determine bad epoch number try: - bad_epochs = lr_scheduler.num_bad_epochs + bad_epochs = scheduler.num_bad_epochs except: bad_epochs = 0 for group in optimizer.param_groups: @@ -691,7 +698,7 @@ def train( # if checkpoint is enabled, save model at each epoch if checkpoint and not param_selection_mode: - self.save_checkpoint(base_path / "checkpoint.pt") + self.model.save(base_path / "checkpoint.pt") # Check whether to 
save best model if ( @@ -701,7 +708,7 @@ def train( and not use_final_model_for_eval ): log.info("saving best model") - self.model.save(base_path / "best-model.pt") + self.model.save(base_path / "best-model.pt", training_parameters=training_parameters) if anneal_with_prestarts: current_state_dict = self.model.state_dict() @@ -712,14 +719,14 @@ def train( if save_model_each_k_epochs > 0 and not epoch % save_model_each_k_epochs: print("saving model of current epoch") model_name = "model_epoch_" + str(epoch) + ".pt" - self.model.save(base_path / model_name) + self.model.save(base_path / model_name, training_parameters=training_parameters) if use_swa: optimizer.swap_swa_sgd() # if we do not use dev data for model selection, save final model if save_final_model and not param_selection_mode: - self.model.save(base_path / "final-model.pt") + self.model.save(base_path / "final-model.pt", training_parameters=training_parameters) except KeyboardInterrupt: log_line(log) @@ -730,7 +737,7 @@ def train( if not param_selection_mode: log.info("Saving model ...") - self.model.save(base_path / "final-model.pt") + self.model.save(base_path / "final-model.pt", training_parameters=training_parameters) log.info("Done.") # test best model if test data is present @@ -760,18 +767,6 @@ def train( "dev_loss_history": dev_loss_history, } - def save_checkpoint(self, model_file: Union[str, Path]): - corpus = self.corpus - self.corpus = None - torch.save(self, str(model_file), pickle_protocol=4) - self.corpus = corpus - - @classmethod - def load_checkpoint(cls, checkpoint: Union[Path, str], corpus: Corpus): - model: ModelTrainer = torch.load(checkpoint, map_location=flair.device) - model.corpus = corpus - return model - def final_test( self, base_path: Union[Path, str], From a96172268c34aa3dc0571ce31037b52d2e7a270d Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 29 Sep 2021 09:51:17 +0200 Subject: [PATCH 2/8] GH-2454: add Flair, PyTorch and Transformers version to automatic model card --- flair/nn/model.py | 34 ++++++++++++++++++++++++---------- flair/trainers/trainer.py | 31 ++++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/flair/nn/model.py b/flair/nn/model.py index 73889a197..7355c5f1a 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -74,15 +74,15 @@ def _init_model_with_state_dict(state): def _fetch_model(model_name) -> str: return model_name - def save(self, model_file: Union[str, Path], training_parameters: Optional[dict] = None): + def save(self, model_file: Union[str, Path]): """ Saves the current model to the provided file. 
:param model_file: the model file """ model_state = self._get_state_dict() - if training_parameters: - model_state['training_parameters'] = training_parameters + if hasattr(self, 'model_card'): + model_state['model_card'] = self.model_card torch.save(model_state, str(model_file), pickle_protocol=4) @@ -104,8 +104,8 @@ def load(cls, model: Union[str, Path]): model = cls._init_model_with_state_dict(state) - if 'training_parameters' in state: - model.training_parameters = state['training_parameters'] + if 'model_card' in state: + model.model_card = state['model_card'] model.eval() model.to(flair.device) @@ -113,13 +113,27 @@ def load(cls, model: Union[str, Path]): return model def print_training_parameters(self): - if hasattr(self, 'training_parameters'): - param_out = "Training Parameters:\n" - for param in self.training_parameters: - param_out += f'{param} = {self.training_parameters[param]}\n' + if hasattr(self, 'model_card'): + param_out = "\n------------------------------------\n" + param_out += "--------- Flair Model Card ---------\n" + param_out += "------------------------------------\n" + param_out += "- this Flair model was trained with:\n" + param_out += f"-- Flair version {self.model_card['flair_version']}\n" + param_out += f"-- PyTorch version {self.model_card['pytorch_version']}\n" + if 'transformers_version' in self.model_card: + param_out += f"-- Transformers version {self.model_card['transformers_version']}\n" + param_out += "------------------------------------\n" + + param_out += "------- Training Parameters: -------\n" + param_out += "------------------------------------\n" + training_params = '\n'.join(f'-- {param} = {self.model_card["training_parameters"][param]}' + for param in self.model_card['training_parameters'] ) + param_out += training_params + "\n" + param_out += "------------------------------------\n" + log.info(param_out) else: - log.info("Training Parameters not stored for this model") + log.info("This model has no model card (likely because it was trained with Flair version < 0.9.1)") class Classifier(Model): diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 315bfd69a..4f8c4217a 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -73,7 +73,7 @@ def resume(self, self.model = model - self.train(**self.model.training_parameters) + self.train(**self.model.model_card) def fine_tune(self, base_path: Union[Path, str], @@ -197,12 +197,25 @@ def train( :return: """ - # remember training parameters - training_parameters = {} + # create a model card for this model with Flair and PyTorch version + model_card = {'flair_version': flair.__version__, 'pytorch_version': torch.__version__} + + # also record Transformers version if library is loaded + try: + import transformers + model_card['transformers_version'] = transformers.__version__ + except: + pass + + # remember all parameters used in train() call local_variables = locals() + training_parameters = {} for parameter in signature(self.train).parameters: training_parameters[parameter] = local_variables[parameter] - self.model.training_parameters = training_parameters + model_card['training_parameters'] = training_parameters + + # add model card to model + self.model.model_card = model_card if use_tensorboard: try: @@ -390,7 +403,7 @@ def train( log_line(log) # update training parameter - self.model.training_parameters['epoch'] = epoch + self.model.model_card['training_parameters']['epoch'] = epoch if anneal_with_prestarts: last_epoch_model_state_dict = 
copy.deepcopy(self.model.state_dict()) @@ -724,7 +737,7 @@ def train( and not use_final_model_for_eval ): log.info("saving best model") - self.model.save(base_path / "best-model.pt", training_parameters=training_parameters) + self.model.save(base_path / "best-model.pt") if anneal_with_prestarts: current_state_dict = self.model.state_dict() @@ -735,14 +748,14 @@ def train( if save_model_each_k_epochs > 0 and not epoch % save_model_each_k_epochs: print("saving model of current epoch") model_name = "model_epoch_" + str(epoch) + ".pt" - self.model.save(base_path / model_name, training_parameters=training_parameters) + self.model.save(base_path / model_name) if use_swa: optimizer.swap_swa_sgd() # if we do not use dev data for model selection, save final model if save_final_model and not param_selection_mode: - self.model.save(base_path / "final-model.pt", training_parameters=training_parameters) + self.model.save(base_path / "final-model.pt") except KeyboardInterrupt: log_line(log) @@ -753,7 +766,7 @@ def train( if not param_selection_mode: log.info("Saving model ...") - self.model.save(base_path / "final-model.pt", training_parameters=training_parameters) + self.model.save(base_path / "final-model.pt") log.info("Done.") # test best model if test data is present From 8d7a2cc7cb9aa86a9648a2196acc9cbd6fc24505 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Thu, 30 Sep 2021 22:24:29 +0200 Subject: [PATCH 3/8] GH-2454: add optimizer and scheduler to model card and enable resumption of training --- flair/models/sequence_tagger_model.py | 11 +-- flair/nn/model.py | 35 ++++++- flair/trainers/trainer.py | 134 +++++++++++++++----------- 3 files changed, 116 insertions(+), 64 deletions(-) diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index fcf21d5cf..65375b284 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -254,14 +254,9 @@ def _init_model_with_state_dict(state): rnn_type = "LSTM" if "rnn_type" not in state.keys() else state["rnn_type"] use_dropout = 0.0 if "use_dropout" not in state.keys() else state["use_dropout"] - use_word_dropout = ( - 0.0 if "use_word_dropout" not in state.keys() else state["use_word_dropout"] - ) - use_locked_dropout = ( - 0.0 - if "use_locked_dropout" not in state.keys() - else state["use_locked_dropout"] - ) + use_word_dropout = 0.0 if "use_word_dropout" not in state.keys() else state["use_word_dropout"] + use_locked_dropout = 0.0 if "use_locked_dropout" not in state.keys() else state["use_locked_dropout"] + train_initial_hidden_state = ( False if "train_initial_hidden_state" not in state.keys() diff --git a/flair/nn/model.py b/flair/nn/model.py index 7355c5f1a..870fa139b 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -81,11 +81,41 @@ def save(self, model_file: Union[str, Path]): """ model_state = self._get_state_dict() + # in Flair <0.9.1, optimizer and scheduler used to train model are not saved + optimizer = scheduler = None + + # write out a "model card" if one is set if hasattr(self, 'model_card'): + + # special handling for optimizer: remember optimizer class and state dictionary + if 'training_parameters' in self.model_card: + training_parameters = self.model_card['training_parameters'] + + if 'optimizer' in training_parameters: + optimizer = training_parameters['optimizer'] + self.model_card['optimizer_class'] = optimizer.__class__ + training_parameters['optimizer_state_dict'] = optimizer.state_dict() + + if 'scheduler' in training_parameters: + scheduler = 
training_parameters['scheduler'] + self.model_card['scheduler_class'] = scheduler.__class__ + training_parameters['scheduler_state_dict'] = scheduler.state_dict() + model_state['model_card'] = self.model_card + # delete optimizer and scheduler instance since this may cause serialization errors + del model_state['model_card']['training_parameters']['optimizer'] + del model_state['model_card']['training_parameters']['scheduler'] + + # save model torch.save(model_state, str(model_file), pickle_protocol=4) + # restore optimizer and scheduler to model card if set + if optimizer: + self.model_card['training_parameters']['optimizer'] = optimizer + if scheduler: + self.model_card['training_parameters']['scheduler'] = scheduler + @classmethod def load(cls, model: Union[str, Path]): """ @@ -127,13 +157,14 @@ def print_training_parameters(self): param_out += "------- Training Parameters: -------\n" param_out += "------------------------------------\n" training_params = '\n'.join(f'-- {param} = {self.model_card["training_parameters"][param]}' - for param in self.model_card['training_parameters'] ) + for param in self.model_card['training_parameters']) param_out += training_params + "\n" param_out += "------------------------------------\n" log.info(param_out) else: - log.info("This model has no model card (likely because it was trained with Flair version < 0.9.1)") + log.info( + "This model has no model card (likely because it is not yet trained or was trained with Flair version < 0.9.1)") class Classifier(Model): diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 4f8c4217a..8a3bbc1cf 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -14,6 +14,8 @@ from torch.optim.sgd import SGD from torch.utils.data.dataset import ConcatDataset +from flair.nn import Model + try: from apex import amp except ImportError: @@ -60,45 +62,13 @@ def check_for_and_delete_previous_best_models(base_path): filename.startswith("best-model")] if len(all_best_model_names) != 0: warnings.warn( - "There should be no best model saved at epoch 1 except there is a model from previous trainings in your training folder. All previous best models will be deleted.") + "There should be no best model saved at epoch 1 except there is a model from previous trainings" + " in your training folder. 
All previous best models will be deleted.") for single_model in all_best_model_names: previous_best_path = os.path.join(base_path, single_model) if os.path.exists(previous_best_path): os.remove(previous_best_path) - def resume(self, - model, - **trainer_args, - ): - - self.model = model - - self.train(**self.model.model_card) - - def fine_tune(self, - base_path: Union[Path, str], - learning_rate: float = 5e-5, - max_epochs: int = 10, - optimizer=torch.optim.AdamW, - scheduler=LinearSchedulerWithWarmup, - warmup_fraction: float = 0.1, - mini_batch_size: int = 4, - embeddings_storage_mode: str = 'none', - **trainer_args, - ): - - return self.train( - base_path=base_path, - learning_rate=learning_rate, - max_epochs=max_epochs, - optimizer=optimizer, - scheduler=scheduler, - warmup_fraction=warmup_fraction, - mini_batch_size=mini_batch_size, - embeddings_storage_mode=embeddings_storage_mode, - **trainer_args, - ) - def train( self, base_path: Union[Path, str], @@ -147,6 +117,8 @@ def train( use_tensorboard: bool = False, tensorboard_log_dir=None, metrics_for_tensorboard=[], + optimizer_state_dict: Optional = None, + scheduler_state_dict: Optional = None, **kwargs, ) -> dict: """ @@ -293,23 +265,16 @@ def train( log_train_part = True if (eval_on_train_fraction == "dev" or eval_on_train_fraction > 0.0) else False if log_train_part: - train_part_size = ( - len(self.corpus.dev) - if eval_on_train_fraction == "dev" + train_part_size = len(self.corpus.dev) if eval_on_train_fraction == "dev" \ else int(len(self.corpus.train) * eval_on_train_fraction) - ) + assert train_part_size > 0 if not eval_on_train_shuffle: train_part_indices = list(range(train_part_size)) - train_part = torch.utils.data.dataset.Subset( - self.corpus.train, train_part_indices - ) + train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices) - if create_loss_file: - # prepare loss logging file and set up header - loss_txt = init_output_file(base_path, "loss.tsv") - else: - loss_txt = None + # prepare loss logging file and set up header + loss_txt = init_output_file(base_path, "loss.tsv") if create_loss_file else None weight_extractor = WeightExtractor(base_path) @@ -326,6 +291,10 @@ def train( self.model, optimizer, opt_level=amp_opt_level ) + # load existing optimizer state dictionary if it exists + if optimizer_state_dict: + optimizer.load_state_dict(optimizer_state_dict) + # minimize training loss if training with dev data, else maximize dev score anneal_mode = "min" if train_with_dev or anneal_against_dev_loss else "max" best_validation_score = 100000000000 if train_with_dev or anneal_against_dev_loss else 0. 
@@ -362,6 +331,14 @@ def train( verbose=True, ) + # load existing scheduler state dictionary if it exists + if scheduler_state_dict: + scheduler.load_state_dict(scheduler_state_dict) + + # update optimizer and scheduler in model card + model_card['training_parameters']['optimizer'] = optimizer + model_card['training_parameters']['scheduler'] = scheduler + if isinstance(scheduler, OneCycleLR) and batch_growth_annealing: raise ValueError("Batch growth with OneCycle policy is not implemented.") @@ -402,7 +379,7 @@ def train( for epoch in range(epoch + 1, max_epochs + 1): log_line(log) - # update training parameter + # update epoch in model card self.model.model_card['training_parameters']['epoch'] = epoch if anneal_with_prestarts: @@ -483,16 +460,14 @@ def train( # if necessary, make batch_steps batch_steps = [batch] if len(batch) > micro_batch_size: - batch_steps = [ - batch[x: x + micro_batch_size] - for x in range(0, len(batch), micro_batch_size) - ] + batch_steps = [batch[x: x + micro_batch_size] for x in range(0, len(batch), micro_batch_size)] # forward and backward for batch for batch_step in batch_steps: # forward pass loss = self.model.forward_loss(batch_step) + if isinstance(loss, Tuple): average_over += loss[1] loss = loss[0] @@ -537,9 +512,7 @@ def train( batch_time = 0 iteration = epoch * total_number_of_batches + batch_no if not param_selection_mode and write_weights: - weight_extractor.extract_weights( - self.model.state_dict(), iteration - ) + weight_extractor.extract_weights(self.model.state_dict(), iteration) if average_over != 0: train_loss /= average_over @@ -796,6 +769,59 @@ def train( "dev_loss_history": dev_loss_history, } + def resume(self, + model: Optional[Model], + **trainer_args, + ): + + self.model = model + + # recover all arguments that were used to train this model + args_used_to_train_model = self.model.model_card['training_parameters'] + + # only pass the class of the optimizer, not the instance + args_used_to_train_model['optimizer'] = self.model.model_card['optimizer_class'] + args_used_to_train_model['scheduler'] = self.model.model_card['scheduler_class'] + + # you can overwrite params with your own + for param in trainer_args: + args_used_to_train_model[param] = trainer_args[param] + if param == 'optimizer' and 'optimizer_state_dict' in args_used_to_train_model: + del args_used_to_train_model['optimizer_state_dict'] + if param == 'scheduler' and 'scheduler_state_dict' in args_used_to_train_model: + del args_used_to_train_model['scheduler_state_dict'] + + # surface nested arguments + kwargs = args_used_to_train_model['kwargs'] + del args_used_to_train_model['kwargs'] + + # resume training with these parameters + self.train(**args_used_to_train_model, **kwargs) + + def fine_tune(self, + base_path: Union[Path, str], + learning_rate: float = 5e-5, + max_epochs: int = 10, + optimizer=torch.optim.AdamW, + scheduler=LinearSchedulerWithWarmup, + warmup_fraction: float = 0.1, + mini_batch_size: int = 4, + embeddings_storage_mode: str = 'none', + **trainer_args, + ): + + return self.train( + base_path=base_path, + learning_rate=learning_rate, + max_epochs=max_epochs, + optimizer=optimizer, + scheduler=scheduler, + warmup_fraction=warmup_fraction, + mini_batch_size=mini_batch_size, + embeddings_storage_mode=embeddings_storage_mode, + **trainer_args, + ) + def final_test( self, base_path: Union[Path, str], From 6e5b5c34fb10eeab5648cb48524b8e43bf4c6f5e Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Thu, 30 Sep 2021 22:26:06 +0200 Subject: [PATCH 4/8] GH-2454: rename 
method to print_model_card --- flair/nn/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/nn/model.py b/flair/nn/model.py index 870fa139b..6eae7c4bf 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -142,7 +142,7 @@ def load(cls, model: Union[str, Path]): return model - def print_training_parameters(self): + def print_model_card(self): if hasattr(self, 'model_card'): param_out = "\n------------------------------------\n" param_out += "--------- Flair Model Card ---------\n" From 0c7fdb40471497cdc765d88ea7d947f89b845fcc Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Thu, 30 Sep 2021 22:42:09 +0200 Subject: [PATCH 5/8] GH-2454: more comments --- flair/trainers/trainer.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 8a3bbc1cf..3d0117103 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -76,17 +76,19 @@ def train( mini_batch_size: int = 32, mini_batch_chunk_size: Optional[int] = None, max_epochs: int = 100, + train_with_dev: bool = False, + train_with_test: bool = False, + monitor_train: bool = False, + monitor_test: bool = False, + main_evaluation_metric: Tuple[str, str] = ("micro avg", 'f1-score'), scheduler=AnnealOnPlateau, - cycle_momentum: bool = False, anneal_factor: float = 0.5, patience: int = 3, - initial_extra_patience: int = 0, min_learning_rate: float = 0.0001, + initial_extra_patience: int = 0, + optimizer: torch.optim.Optimizer = SGD, + cycle_momentum: bool = False, warmup_fraction: float = 0.1, - train_with_dev: bool = False, - train_with_test: bool = False, - monitor_train: bool = False, - monitor_test: bool = False, embeddings_storage_mode: str = "cpu", checkpoint: bool = False, save_final_model: bool = True, @@ -104,15 +106,12 @@ def train( eval_on_train_fraction: float = 0.0, eval_on_train_shuffle: bool = False, save_model_each_k_epochs: int = 0, - main_evaluation_metric: Tuple[str, str] = ("micro avg", 'f1-score'), tensorboard_comment: str = '', - save_best_checkpoints: bool = False, use_swa: bool = False, use_final_model_for_eval: bool = False, gold_label_dictionary_for_eval: Optional[Dictionary] = None, create_file_logs: bool = True, create_loss_file: bool = True, - optimizer: torch.optim.Optimizer = SGD, epoch: int = 0, use_tensorboard: bool = False, tensorboard_log_dir=None, @@ -129,13 +128,15 @@ def train( :param mini_batch_chunk_size: If mini-batches are larger than this number, they get broken down into chunks of this size for processing purposes :param max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed. 
:param scheduler: The learning rate scheduler to use + :param checkpoint: If True, a full checkpoint is saved at end of each epoch :param cycle_momentum: If scheduler is OneCycleLR, whether the scheduler should cycle also the momentum :param anneal_factor: The factor by which the learning rate is annealed :param patience: Patience is the number of epochs with no improvement the Trainer waits until annealing the learning rate :param min_learning_rate: If the learning rate falls below this threshold, training terminates :param warmup_fraction: Fraction of warmup steps if the scheduler is LinearSchedulerWithWarmup - :param train_with_dev: If True, training is performed using both train+dev data + :param train_with_dev: If True, the data from dev split is added to the training data + :param train_with_test: If True, the data from test split is added to the training data :param monitor_train: If True, training data is evaluated at end of each epoch :param monitor_test: If True, test data is evaluated at end of each epoch :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), @@ -154,10 +155,8 @@ def train( and kept fixed during training, otherwise it's sampled at beginning of each epoch :param save_model_each_k_epochs: Each k epochs, a model state will be written out. If set to '5', a model will be saved each 5 epochs. Default is 0 which means no model saving. - :param save_model_epoch_step: Each save_model_epoch_step'th epoch the thus far trained model will be saved - :param classification_main_metric: Type of metric to use for best model tracking and learning rate scheduling (if dev data is available, otherwise loss will be used), currently only applicable for text_classification_model + :param main_evaluation_metric: Type of metric to use for best model tracking and learning rate scheduling (if dev data is available, otherwise loss will be used), currently only applicable for text_classification_model :param tensorboard_comment: Comment to use for tensorboard logging - :param save_best_checkpoints: If True, in addition to saving the best model also the corresponding checkpoint is saved :param create_file_logs: If True, the logs will also be stored in a file 'training.log' in the model folder :param create_loss_file: If True, the loss will be writen to a file 'loss.tsv' in the model folder :param optimizer: The optimizer to use (typically SGD or Adam) @@ -389,9 +388,7 @@ def train( train_part_indices = list(range(self.corpus.train)) random.shuffle(train_part_indices) train_part_indices = train_part_indices[:train_part_size] - train_part = torch.utils.data.dataset.Subset( - self.corpus.train, train_part_indices - ) + train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices) # get new learning rate for group in optimizer.param_groups: From f565d16661ec512dd467ed046fdac8c6e011e32d Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Thu, 30 Sep 2021 23:01:06 +0200 Subject: [PATCH 6/8] GH-2454: scheduler and optimizer serialization --- flair/nn/model.py | 10 +++++----- flair/trainers/trainer.py | 4 ---- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/flair/nn/model.py b/flair/nn/model.py index 6eae7c4bf..e8f317c46 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -93,19 +93,19 @@ def save(self, model_file: Union[str, Path]): if 'optimizer' in training_parameters: optimizer = training_parameters['optimizer'] - self.model_card['optimizer_class'] = optimizer.__class__ training_parameters['optimizer_state_dict'] = 
optimizer.state_dict() + training_parameters['optimizer'] = optimizer.__class__ if 'scheduler' in training_parameters: scheduler = training_parameters['scheduler'] - self.model_card['scheduler_class'] = scheduler.__class__ training_parameters['scheduler_state_dict'] = scheduler.state_dict() + training_parameters['scheduler'] = scheduler.__class__ model_state['model_card'] = self.model_card - # delete optimizer and scheduler instance since this may cause serialization errors - del model_state['model_card']['training_parameters']['optimizer'] - del model_state['model_card']['training_parameters']['scheduler'] + # # delete optimizer and scheduler instance since this may cause serialization errors + # del model_state['model_card']['training_parameters']['optimizer'] + # del model_state['model_card']['training_parameters']['scheduler'] # save model torch.save(model_state, str(model_file), pickle_protocol=4) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 3d0117103..34f0e00e9 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -776,10 +776,6 @@ def resume(self, # recover all arguments that were used to train this model args_used_to_train_model = self.model.model_card['training_parameters'] - # only pass the class of the optimizer, not the instance - args_used_to_train_model['optimizer'] = self.model.model_card['optimizer_class'] - args_used_to_train_model['scheduler'] = self.model.model_card['scheduler_class'] - # you can overwrite params with your own for param in trainer_args: args_used_to_train_model[param] = trainer_args[param] From 6c4a55cfb07f1ed62c158a6c497c0a933b0df48a Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Thu, 30 Sep 2021 23:02:00 +0200 Subject: [PATCH 7/8] GH-2454: scheduler and optimizer serialization --- flair/nn/model.py | 4 ---- flair/training_utils.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/flair/nn/model.py b/flair/nn/model.py index e8f317c46..007dde46d 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -103,10 +103,6 @@ def save(self, model_file: Union[str, Path]): model_state['model_card'] = self.model_card - # # delete optimizer and scheduler instance since this may cause serialization errors - # del model_state['model_card']['training_parameters']['optimizer'] - # del model_state['model_card']['training_parameters']['scheduler'] - # save model torch.save(model_state, str(model_file), pickle_protocol=4) diff --git a/flair/training_utils.py b/flair/training_utils.py index 7c82b22e1..138283709 100644 --- a/flair/training_utils.py +++ b/flair/training_utils.py @@ -315,7 +315,7 @@ def state_dict(self): def load_state_dict(self, state_dict): self.__dict__.update(state_dict) - self._init_is_better(mode=self.mode, threshold=self.threshold, threshold_mode=self.threshold_mode) + self._init_is_better(mode=self.mode) def init_output_file(base_path: Union[str, Path], file_name: str) -> Path: From 7343c56ede43d8074c43a418655b0cb3508a7b21 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 1 Oct 2021 09:01:00 +0200 Subject: [PATCH 8/8] GH-2454: fix unit tests --- tests/test_sequence_tagger.py | 10 ++++--- tests/test_text_classifier.py | 50 ++++++----------------------------- 2 files changed, 15 insertions(+), 45 deletions(-) diff --git a/tests/test_sequence_tagger.py b/tests/test_sequence_tagger.py index d9566261e..bd6ae7c22 100644 --- a/tests/test_sequence_tagger.py +++ b/tests/test_sequence_tagger.py @@ -345,6 +345,7 @@ def test_train_load_use_tagger_multicorpus(results_base_path, 
tasks_base_path): @pytest.mark.integration def test_train_resume_tagger(results_base_path, tasks_base_path): + corpus_1 = flair.datasets.ColumnCorpus( data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"} ) @@ -361,13 +362,16 @@ def test_train_resume_tagger(results_base_path, tasks_base_path): use_crf=False, ) + # train model for 2 epochs trainer = ModelTrainer(model, corpus) trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True) - del trainer, model - trainer = ModelTrainer.load_checkpoint(results_base_path / "checkpoint.pt", corpus) + del model - trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True) + # load the checkpoint model and train until epoch 4 + checkpoint_model = SequenceTagger.load(results_base_path / "checkpoint.pt") + trainer.resume(model=checkpoint_model, + max_epochs=4) # clean up results directory shutil.rmtree(results_base_path) diff --git a/tests/test_text_classifier.py b/tests/test_text_classifier.py index 76f69d4a8..99d10c719 100644 --- a/tests/test_text_classifier.py +++ b/tests/test_text_classifier.py @@ -263,51 +263,17 @@ def test_train_resume_classifier(results_base_path, tasks_base_path): multi_label=False, label_type="topic") + # train model for 2 epochs trainer = ModelTrainer(model, corpus) trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True) - del trainer, model - trainer = ModelTrainer.load_checkpoint(results_base_path / "checkpoint.pt", corpus) - trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True) + del model + + # load the checkpoint model and train until epoch 4 + checkpoint_model = TextClassifier.load(results_base_path / "checkpoint.pt") + trainer.resume(model=checkpoint_model, + max_epochs=4) # clean up results directory shutil.rmtree(results_base_path) - del trainer - - -# def test_labels_to_indices(tasks_base_path): -# corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "ag_news", label_type="topic") -# label_dict = corpus.make_label_dictionary() -# model = TextClassifier(document_embeddings, -# label_dictionary=label_dict, -# label_type="topic", -# multi_label=False) -# -# result = model._labels_to_indices(corpus.train) -# -# for i in range(len(corpus.train)): -# expected = label_dict.get_idx_for_item(corpus.train[i].labels[0].value) -# actual = result[i].item() -# -# assert expected == actual -# -# -# def test_labels_to_one_hot(tasks_base_path): -# corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "ag_news", label_type="topic") -# label_dict = corpus.make_label_dictionary() -# model = TextClassifier(document_embeddings, -# label_dictionary=label_dict, -# label_type="topic", -# multi_label=False) -# -# result = model._labels_to_one_hot(corpus.train) -# -# for i in range(len(corpus.train)): -# expected = label_dict.get_idx_for_item(corpus.train[i].labels[0].value) -# actual = result[i] -# -# for idx in range(len(label_dict)): -# if idx == expected: -# assert actual[idx] == 1 -# else: -# assert actual[idx] == 0 + del trainer \ No newline at end of file
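
For reference, the training-resumption workflow that this series enables looks as follows from user code. This is a minimal sketch mirroring the updated unit tests above; the corpus choice and checkpoint path are illustrative assumptions and are not part of the patch:

import flair.datasets
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# corpus used for the original training run (illustrative; any Flair corpus works)
corpus = flair.datasets.UD_ENGLISH()

# train(..., checkpoint=True) now writes "checkpoint.pt" through model.save(), so the
# checkpoint is loaded like any other model and carries its model card with it
# (path is an assumed example)
checkpoint_model = SequenceTagger.load("resources/taggers/example-upos/checkpoint.pt")

# prints Flair/PyTorch (and, if installed, Transformers) versions plus all train() arguments
checkpoint_model.print_model_card()

# resume() re-applies the recorded train() arguments, restores the stored optimizer and
# scheduler state dicts, and continues training; arguments can still be overridden
trainer = ModelTrainer(checkpoint_model, corpus)
trainer.resume(model=checkpoint_model, max_epochs=4)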