diff --git a/flair/__init__.py b/flair/__init__.py index 5702389c5..1dae2cd43 100644 --- a/flair/__init__.py +++ b/flair/__init__.py @@ -25,7 +25,7 @@ import logging.config -__version__ = "0.8" +__version__ = "0.8.1" logging.config.dictConfig( { diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 84846dcda..2f034057e 100755 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -14,12 +14,14 @@ from .sequence_labeling import CONLL_03 from .sequence_labeling import CONLL_03_GERMAN from .sequence_labeling import CONLL_03_DUTCH +from .sequence_labeling import ICELANDIC_NER from .sequence_labeling import CONLL_03_SPANISH from .sequence_labeling import CONLL_2000 from .sequence_labeling import DANE from .sequence_labeling import EUROPARL_NER_GERMAN from .sequence_labeling import GERMEVAL_14 from .sequence_labeling import INSPEC +from .sequence_labeling import JAPANESE_NER from .sequence_labeling import LER_GERMAN from .sequence_labeling import MIT_MOVIE_NER_SIMPLE from .sequence_labeling import MIT_MOVIE_NER_COMPLEX @@ -56,6 +58,7 @@ from .sequence_labeling import WSD_UFSAC from .sequence_labeling import WNUT_2020_NER from .sequence_labeling import XTREME +from .sequence_labeling import REDDIT_EL_GOLD # Expose all document classification datasets from .document_classification import ClassificationCorpus diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py index adeb7f2c2..c5b3cce3c 100644 --- a/flair/datasets/document_classification.py +++ b/flair/datasets/document_classification.py @@ -11,7 +11,7 @@ Corpus, Token, FlairDataset, - Tokenizer + Tokenizer, DataPair ) from flair.tokenization import SegtokTokenizer, SpaceTokenizer from flair.datasets.base import find_train_dev_test_files @@ -454,9 +454,12 @@ def __init__( # most data sets have the token text in the first column, if not, pass 'text' as column self.text_columns: List[int] = [] + self.pair_columns: List[int] = [] for column in column_name_map: if column_name_map[column] == "text": self.text_columns.append(column) + if column_name_map[column] == "pair": + self.pair_columns.append(column) with open(self.path_to_file, encoding=encoding) as csv_file: @@ -488,26 +491,8 @@ def __init__( if self.in_memory: - text = " ".join( - [row[text_column] for text_column in self.text_columns] - ) - - if self.max_chars_per_doc > 0: - text = text[: self.max_chars_per_doc] - - sentence = Sentence(text, use_tokenizer=self.tokenizer) - - for column in self.column_name_map: - column_value = row[column] - if ( - self.column_name_map[column].startswith("label") - and column_value - ): - if column_value != self.no_class_label: - sentence.add_label(label_type, column_value) + sentence = self._make_labeled_data_point(row) - if 0 < self.max_tokens_per_doc < len(sentence): - sentence.tokens = sentence.tokens[: self.max_tokens_per_doc] self.sentences.append(sentence) else: @@ -515,6 +500,52 @@ def __init__( self.total_sentence_count += 1 + def _make_labeled_data_point(self, row): + + # make sentence from text (and filter for length) + text = " ".join( + [row[text_column] for text_column in self.text_columns] + ) + + if self.max_chars_per_doc > 0: + text = text[: self.max_chars_per_doc] + + sentence = Sentence(text, use_tokenizer=self.tokenizer) + + if 0 < self.max_tokens_per_doc < len(sentence): + sentence.tokens = sentence.tokens[: self.max_tokens_per_doc] + + # if a pair column is defined, make a sentence pair object + if len(self.pair_columns) > 0: + + text = " ".join( + 
[row[pair_column] for pair_column in self.pair_columns] + ) + + if self.max_chars_per_doc > 0: + text = text[: self.max_chars_per_doc] + + pair = Sentence(text, use_tokenizer=self.tokenizer) + + if 0 < self.max_tokens_per_doc < len(sentence): + pair.tokens = pair.tokens[: self.max_tokens_per_doc] + + data_point = DataPair(first=sentence, second=pair) + + else: + data_point = sentence + + for column in self.column_name_map: + column_value = row[column] + if ( + self.column_name_map[column].startswith("label") + and column_value + ): + if column_value != self.no_class_label: + data_point.add_label(self.label_type, column_value) + + return data_point + def is_in_memory(self) -> bool: return self.in_memory @@ -527,20 +558,7 @@ def __getitem__(self, index: int = 0) -> Sentence: else: row = self.raw_data[index] - text = " ".join([row[text_column] for text_column in self.text_columns]) - - if self.max_chars_per_doc > 0: - text = text[: self.max_chars_per_doc] - - sentence = Sentence(text, use_tokenizer=self.tokenizer) - for column in self.column_name_map: - column_value = row[column] - if self.column_name_map[column].startswith("label") and column_value: - if column_value != self.no_class_label: - sentence.add_label(self.label_type, column_value) - - if 0 < self.max_tokens_per_doc < len(sentence): - sentence.tokens = sentence.tokens[: self.max_tokens_per_doc] + sentence = self._make_labeled_data_point(row) return sentence diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index f1f47427d..1dae398dc 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -2,8 +2,14 @@ import re import os import shutil +import glob from pathlib import Path from typing import Union, Dict, List +from os import listdir +import zipfile +from zipfile import ZipFile +import csv + import flair from flair.data import Corpus, MultiCorpus, FlairDataset, Sentence, Token @@ -592,6 +598,152 @@ def __offset_docstarts(file_in: Union[str, Path], file_out: Union[str, Path]): f.write("\n") + +class ICELANDIC_NER(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + """ + Initialize the ICELANDIC_NER corpus. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + if not os.path.isfile(data_folder / 'icelandic_ner.txt'): + # download zip + icelandic_ner ="https://repository.clarin.is/repository/xmlui/handle/20.500.12537/42/allzip" + icelandic_ner_path = cached_path(icelandic_ner, Path("datasets") / dataset_name) + + #unpacking the zip + unpack_file( + icelandic_ner_path, + data_folder, + mode="zip", + keep=True + ) + outputfile = os.path.abspath(data_folder) + + #merge the files in one as the zip is containing multiples files + + with open(outputfile/data_folder/"icelandic_ner.txt", "wb") as outfile: + for files in os.walk(outputfile/data_folder): + f = files[2] + for i in range(len(f)): + if f[i].endswith('.txt'): + with open(outputfile/data_folder/f[i], 'rb') as infile: + contents = infile.read() + outfile.write(contents) + + + super(ICELANDIC_NER, self).__init__( + data_folder, + columns, + train_file='icelandic_ner.txt', + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + **corpusargs, + ) + +class JAPANESE_NER(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + """ + Initialize the Hironsan/IOB2 corpus for Japanese. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: 'text', 1: 'ner'} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data from github if necessary (hironsan.txt, ja.wikipedia.conll) + IOB2_path = "https://raw.githubusercontent.com/Hironsan/IOB2Corpus/master/" + + # download files if not present locally + cached_path(f"{IOB2_path}hironsan.txt", data_folder / 'raw') + cached_path(f"{IOB2_path}ja.wikipedia.conll", data_folder / 'raw') + + # we need to modify the original files by adding new lines after after the end of each sentence + train_data_file = data_folder / 'train.txt' + if not train_data_file.is_file(): + self.__prepare_jap_wikinews_corpus(data_folder / 'raw' / "hironsan.txt", data_folder / 'train.txt') + self.__prepare_jap_wikipedia_corpus(data_folder / 'raw' / "ja.wikipedia.conll", data_folder / 'train.txt') + + super(JAPANESE_NER, self).__init__( + data_folder, + columns, + train_file='train.txt', + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + **corpusargs, + ) + + @staticmethod + def __prepare_jap_wikipedia_corpus(file_in: Union[str, Path], file_out: Union[str, Path]): + with open(file_in, 'r') as f: + lines = f.readlines() + with open(file_out, 'a') as f: + for line in lines: + if (line[0] == "。"): + f.write(line) + f.write("\n") + elif (line[0] == "\n"): + continue + else: + f.write(line) + + @staticmethod + def __prepare_jap_wikinews_corpus(file_in: Union[str, Path], file_out: Union[str, Path]): + with open(file_in, 'r') as f: + lines = f.readlines() + with open(file_out, 'a') as f: + for line in lines: + sp_line = line.split("\t") + if (sp_line[0] == "\n"): + f.write("\n") + else: + f.write(sp_line[0] + "\t" + sp_line[len(sp_line) - 1]) + class STACKOVERFLOW_NER(ColumnCorpus): def __init__( self, @@ -3445,3 +3597,222 @@ def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): else: liste = line.split() f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') + + +class REDDIT_EL_GOLD(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + in_memory: bool = True, + **corpusargs, + ): + """ + Initialize the Reddit Entity Linking corpus containing gold annotations only (https://arxiv.org/abs/2101.01228v2) in the NER-like column format. + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download and parse data if necessary + reddit_el_path = "https://zenodo.org/record/3970806/files/reddit_el.zip" + corpus_file_name = "reddit_el_gold.txt" + parsed_dataset = data_folder / corpus_file_name + + if not parsed_dataset.exists(): + reddit_el_zip = cached_path(f"{reddit_el_path}", Path("datasets") / dataset_name) + unpack_file(reddit_el_zip, data_folder, "zip", False) + + with open(data_folder / corpus_file_name, "w") as txtout: + + # First parse the post titles + with open(data_folder / "posts.tsv", "r") as tsvin1, open(data_folder / "gold_post_annotations.tsv", "r") as tsvin2: + + posts = csv.reader(tsvin1, delimiter="\t") + self.post_annotations = csv.reader(tsvin2, delimiter="\t") + self.curr_annot = next(self.post_annotations) + + for row in posts: # Go through all the post titles + + txtout.writelines("-DOCSTART-\n\n") # Start each post with a -DOCSTART- token + + # Keep track of how many and which entity mentions does a given post title have + link_annots = [] # [start pos, end pos, wiki page title] of an entity mention + + # Check if the current post title has an entity link and parse accordingly + if row[0] == self.curr_annot[0]: + + link_annots.append((int(self.curr_annot[4]), int(self.curr_annot[5]), self.curr_annot[3])) + link_annots = self._fill_annot_array(link_annots, row[0], post_flag = True) + + # Post titles with entity mentions (if any) are handled via this function + self._text_to_cols(Sentence(row[2], use_tokenizer = True), link_annots, txtout) + else: + self._text_to_cols(Sentence(row[2], use_tokenizer = True), link_annots, txtout) + + # Then parse the comments + with open(data_folder / "comments.tsv", "r") as tsvin3, open(data_folder / "gold_comment_annotations.tsv", "r") as tsvin4: + + self.comments = csv.reader(tsvin3, delimiter="\t") + self.comment_annotations = csv.reader(tsvin4, delimiter="\t") + self.curr_annot = next(self.comment_annotations) + self.curr_row = next(self.comments) + self.stop_iter = False + + # Iterate over the comments.tsv file, until the end is reached + while not self.stop_iter: + + txtout.writelines("-DOCSTART-\n") # Start each comment thread with a -DOCSTART- token + + # Keep track of the current comment thread and its corresponding key, on which the annotations are matched. + # Each comment thread is handled as one 'document'. + self.curr_comm = self.curr_row[4] + comm_key = self.curr_row[0] + + # Python's csv package for some reason fails to correctly parse a handful of rows inside the comments.tsv file. + # This if-condition is needed to handle this problem. 
+ if comm_key in {"en5rf4c", "es3ia8j", "es3lrmw"}: + if comm_key == "en5rf4c": + self.parsed_row = (r.split("\t") for r in self.curr_row[4].split("\n")) + self.curr_comm = next(self.parsed_row) + self._fill_curr_comment(fix_flag = True) + # In case we are dealing with properly parsed rows, proceed with a regular parsing procedure + else: + self._fill_curr_comment(fix_flag = False) + + link_annots = [] # [start pos, end pos, wiki page title] of an entity mention + + # Check if the current comment thread has an entity link and parse accordingly, same as with post titles above + if comm_key == self.curr_annot[0]: + link_annots.append((int(self.curr_annot[4]), int(self.curr_annot[5]), self.curr_annot[3])) + link_annots = self._fill_annot_array(link_annots, comm_key, post_flag = False) + self._text_to_cols(Sentence(self.curr_comm, use_tokenizer = True), link_annots, txtout) + else: + # In two of the comment thread a case of capital letter spacing occurs, which the SegtokTokenizer cannot properly handle. + # The following if-elif condition handles these two cases and as result writes full capitalized words in each corresponding row, + # and not just single letters into single rows. + if comm_key == "dv74ybb": + self.curr_comm = " ".join([word.replace(" ", "") for word in self.curr_comm.split(" ")]) + elif comm_key == "eci2lut": + self.curr_comm = (self.curr_comm[:18] + self.curr_comm[18:27].replace(" ", "") + self.curr_comm[27:55] + + self.curr_comm[55:68].replace(" ", "") + self.curr_comm[68:85] + self.curr_comm[85:92].replace(" ", "") + + self.curr_comm[92:]) + + self._text_to_cols(Sentence(self.curr_comm, use_tokenizer = True), link_annots, txtout) + + super(REDDIT_EL_GOLD, self).__init__( + data_folder, + columns, + train_file=corpus_file_name, + column_delimiter="\t", + in_memory=in_memory, + document_separator_token="-DOCSTART-", + **corpusargs, + ) + + def _text_to_cols(self, sentence: Sentence, links: list, outfile): + """ + Convert a tokenized sentence into column format + :param sentence: Flair Sentence object containing a tokenized post title or comment thread + :param links: array containing information about the starting and ending position of an entity mention, as well + as its corresponding wiki tag + :param outfile: file, to which the output is written + """ + for i in range(0, len(sentence)): + # If there are annotated entity mentions for given post title or a comment thread + if links: + # Keep track which is the correct corresponding entity link, in cases where there is >1 link in a sentence + link_index = [j for j,v in enumerate(links) if (sentence[i].start_pos >= v[0] and sentence[i].end_pos <= v[1])] + # Write the token with a corresponding tag to file + try: + if any(sentence[i].start_pos == v[0] and sentence[i].end_pos == v[1] for j,v in enumerate(links)): + outfile.writelines(sentence[i].text + "\tS-Link:" + links[link_index[0]][2] + "\n") + elif any(sentence[i].start_pos == v[0] and sentence[i].end_pos != v[1] for j,v in enumerate(links)): + outfile.writelines(sentence[i].text + "\tB-Link:" + links[link_index[0]][2] + "\n") + elif any(sentence[i].start_pos >= v[0] and sentence[i].end_pos <= v[1] for j,v in enumerate(links)): + outfile.writelines(sentence[i].text + "\tI-Link:" + links[link_index[0]][2] + "\n") + else: + outfile.writelines(sentence[i].text + "\tO\n") + # IndexError is raised in cases when there is exactly one link in a sentence, therefore can be dismissed + except IndexError: + pass + + # If a comment thread or a post title has no entity link, all tokens 
are assigned the O tag + else: + outfile.writelines(sentence[i].text + "\tO\n") + + # Prevent writing empty lines if e.g. a quote comes after a dot or initials are tokenized + # incorrectly, in order to keep the desired format (empty line as a sentence separator). + try: + if ((sentence[i].text in {".", "!", "?", "!*"}) and + (sentence[i+1].text not in {'"', '“', "'", "''", "!", "?", ";)", "."}) and + ("." not in sentence[i-1].text)): + outfile.writelines("\n") + except IndexError: + # Thrown when the second check above happens, but the last token of a sentence is reached. + # Indicates that the EOS punctuaion mark is present, therefore an empty line needs to be written below. + outfile.writelines("\n") + + # If there is no punctuation mark indicating EOS, an empty line is still needed after the EOS + if sentence[-1].text not in {".", "!", "?"}: + outfile.writelines("\n") + + def _fill_annot_array(self, annot_array: list, key: str, post_flag: bool) -> list: + """ + Fills the array containing information about the entity mention annotations, used in the _text_to_cols method + :param annot_array: array to be filled + :param key: reddit id, on which the post title/comment thread is matched with its corresponding annotation + :param post_flag: flag indicating whether the annotations are collected for the post titles (=True) + or comment threads (=False) + """ + next_annot = None + while True: + # Check if further annotations belong to the current post title or comment thread as well + try: + next_annot = next(self.post_annotations) if post_flag else next(self.comment_annotations) + if next_annot[0] == key: + annot_array.append((int(next_annot[4]), int(next_annot[5]), next_annot[3])) + else: + self.curr_annot = next_annot + break + # Stop when the end of an annotation file is reached + except StopIteration: + break + return annot_array + + def _fill_curr_comment(self, fix_flag: bool): + """ + Extends the string containing the current comment thread, which is passed to _text_to_cols method, when the + comments are parsed. 
+ :param fix_flag: flag indicating whether the method is called when the incorrectly imported rows are parsed (=True) + or regular rows (=False) + """ + next_row = None + while True: + # Check if further annotations belong to the current sentence as well + try: + next_row = next(self.comments) if not fix_flag else next(self.parsed_row) + if len(next_row) < 2: + # 'else " "' is needed to keep the proper token positions (for accordance with annotations) + self.curr_comm += next_row[0] if any(next_row) else " " + else: + self.curr_row = next_row + break + except StopIteration: # When the end of the comments.tsv file is reached + self.curr_row = next_row + self.stop_iter = True if not fix_flag else False + break diff --git a/flair/embeddings/__init__.py b/flair/embeddings/__init__.py index 9776320ad..84a064d01 100644 --- a/flair/embeddings/__init__.py +++ b/flair/embeddings/__init__.py @@ -26,6 +26,7 @@ from .document import DocumentTFIDFEmbeddings from .document import DocumentRNNEmbeddings from .document import DocumentLMEmbeddings +from .document import DocumentCNNEmbeddings from .document import SentenceTransformerDocumentEmbeddings # Expose image embedding classes diff --git a/flair/embeddings/document.py b/flair/embeddings/document.py index f743aed7f..9811bb377 100644 --- a/flair/embeddings/document.py +++ b/flair/embeddings/document.py @@ -217,7 +217,8 @@ def __setstate__(self, d): if "config_state_dict" in d: # load transformer model - config_class = CONFIG_MAPPING[d["config_state_dict"]["model_type"]] + model_type = d["config_state_dict"]["model_type"] if "model_type" in d["config_state_dict"] else "bert" + config_class = CONFIG_MAPPING[model_type] loaded_config = config_class.from_dict(d["config_state_dict"]) # constructor arguments @@ -585,6 +586,59 @@ def _apply(self, fn): child_module._apply(fn) + def __getstate__(self): + + # serialize the language models and the constructor arguments (but nothing else) + model_state = { + "state_dict": self.state_dict(), + + "embeddings": self.embeddings.embeddings, + "hidden_size": self.rnn.hidden_size, + "rnn_layers": self.rnn.num_layers, + "reproject_words": self.reproject_words, + "reproject_words_dimension": self.embeddings_dimension, + "bidirectional": self.bidirectional, + "dropout": self.dropout.p if self.dropout is not None else 0., + "word_dropout": self.word_dropout.p if self.word_dropout is not None else 0., + "locked_dropout": self.locked_dropout.p if self.locked_dropout is not None else 0., + "rnn_type": self.rnn_type, + "fine_tune": not self.static_embeddings, + } + + return model_state + + def __setstate__(self, d): + + # special handling for deserializing language models + if "state_dict" in d: + + # re-initialize language model with constructor arguments + language_model = DocumentRNNEmbeddings( + embeddings=d['embeddings'], + hidden_size=d['hidden_size'], + rnn_layers=d['rnn_layers'], + reproject_words=d['reproject_words'], + reproject_words_dimension=d['reproject_words_dimension'], + bidirectional=d['bidirectional'], + dropout=d['dropout'], + word_dropout=d['word_dropout'], + locked_dropout=d['locked_dropout'], + rnn_type=d['rnn_type'], + fine_tune=d['fine_tune'], + ) + + language_model.load_state_dict(d['state_dict']) + + # copy over state dictionary to self + for key in language_model.__dict__.keys(): + self.__dict__[key] = language_model.__dict__[key] + + # set the language model to eval() by default (this is necessary since FlairEmbeddings "protect" the LM + # in their "self.train()" method) + self.eval() + + else: + 
self.__dict__ = d class DocumentLMEmbeddings(DocumentEmbeddings): def __init__(self, flair_embeddings: List[FlairEmbeddings]): @@ -688,3 +742,162 @@ def _add_embeddings_to_sentences(self, sentences: List[Sentence]): def embedding_length(self) -> int: """Returns the length of the embedding vector.""" return self.model.get_sentence_embedding_dimension() + + +class DocumentCNNEmbeddings(DocumentEmbeddings): + def __init__( + self, + embeddings: List[TokenEmbeddings], + kernels=((100, 3), (100, 4), (100, 5)), + reproject_words: bool = True, + reproject_words_dimension: int = None, + dropout: float = 0.5, + word_dropout: float = 0.0, + locked_dropout: float = 0.0, + fine_tune: bool = True, + ): + """The constructor takes a list of embeddings to be combined. + :param embeddings: a list of token embeddings + :param kernels: list of (number of kernels, kernel size) + :param reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear + layer before putting them into the rnn or not + :param reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output + dimension as before will be taken. + :param dropout: the dropout value to be used + :param word_dropout: the word dropout value to be used, if 0.0 word dropout is not used + :param locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used + """ + super().__init__() + + self.embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embeddings) + self.length_of_all_token_embeddings: int = self.embeddings.embedding_length + + self.kernels = kernels + self.reproject_words = reproject_words + + self.static_embeddings = False if fine_tune else True + + self.embeddings_dimension: int = self.length_of_all_token_embeddings + if self.reproject_words and reproject_words_dimension is not None: + self.embeddings_dimension = reproject_words_dimension + + self.word_reprojection_map = torch.nn.Linear( + self.length_of_all_token_embeddings, self.embeddings_dimension + ) + + # CNN + self.__embedding_length: int = sum([kernel_num for kernel_num, kernel_size in self.kernels]) + self.convs = torch.nn.ModuleList( + [ + torch.nn.Conv1d(self.embeddings_dimension, kernel_num, kernel_size) for kernel_num, kernel_size in self.kernels + ] + ) + self.pool = torch.nn.AdaptiveMaxPool1d(1) + + self.name = "document_cnn" + + # dropouts + self.dropout = torch.nn.Dropout(dropout) if dropout > 0.0 else None + self.locked_dropout = ( + LockedDropout(locked_dropout) if locked_dropout > 0.0 else None + ) + self.word_dropout = WordDropout(word_dropout) if word_dropout > 0.0 else None + + torch.nn.init.xavier_uniform_(self.word_reprojection_map.weight) + + self.to(flair.device) + + self.eval() + + @property + def embedding_length(self) -> int: + return self.__embedding_length + + def _add_embeddings_internal(self, sentences: Union[List[Sentence], Sentence]): + """Add embeddings to all sentences in the given list of sentences. If embeddings are already added, update + only if embeddings are non-static.""" + + # TODO: remove in future versions + if not hasattr(self, "locked_dropout"): + self.locked_dropout = None + if not hasattr(self, "word_dropout"): + self.word_dropout = None + + if type(sentences) is Sentence: + sentences = [sentences] + + self.zero_grad() # is it necessary? 
+ + # embed words in the sentence + self.embeddings.embed(sentences) + + lengths: List[int] = [len(sentence.tokens) for sentence in sentences] + longest_token_sequence_in_batch: int = max(lengths) + + pre_allocated_zero_tensor = torch.zeros( + self.embeddings.embedding_length * longest_token_sequence_in_batch, + dtype=torch.float, + device=flair.device, + ) + + all_embs: List[torch.Tensor] = list() + for sentence in sentences: + all_embs += [ + emb for token in sentence for emb in token.get_each_embedding() + ] + nb_padding_tokens = longest_token_sequence_in_batch - len(sentence) + + if nb_padding_tokens > 0: + t = pre_allocated_zero_tensor[ + : self.embeddings.embedding_length * nb_padding_tokens + ] + all_embs.append(t) + + sentence_tensor = torch.cat(all_embs).view( + [ + len(sentences), + longest_token_sequence_in_batch, + self.embeddings.embedding_length, + ] + ) + + # before-RNN dropout + if self.dropout: + sentence_tensor = self.dropout(sentence_tensor) + if self.locked_dropout: + sentence_tensor = self.locked_dropout(sentence_tensor) + if self.word_dropout: + sentence_tensor = self.word_dropout(sentence_tensor) + + # reproject if set + if self.reproject_words: + sentence_tensor = self.word_reprojection_map(sentence_tensor) + + # push CNN + x = sentence_tensor + x = x.permute(0, 2, 1) + + rep = [self.pool(torch.nn.functional.relu(conv(x))) for conv in self.convs] + outputs = torch.cat(rep, 1) + + outputs = outputs.reshape(outputs.size(0), -1) + + # after-CNN dropout + if self.dropout: + outputs = self.dropout(outputs) + if self.locked_dropout: + outputs = self.locked_dropout(outputs) + + # extract embeddings from CNN + for sentence_no, length in enumerate(lengths): + embedding = outputs[sentence_no] + + if self.static_embeddings: + embedding = embedding.detach() + + sentence = sentences[sentence_no] + sentence.set_embedding(self.name, embedding) + + def _apply(self, fn): + for child_module in self.children(): + child_module._apply(fn) \ No newline at end of file diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 4692dd402..c0d74bc82 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -409,8 +409,8 @@ def __init__(self, "multi-backward": f"{hu_path}/lm-jw300-backward-v0.1.pt", "multi-v0-forward": f"{hu_path}/lm-multi-forward-v0.1.pt", "multi-v0-backward": f"{hu_path}/lm-multi-backward-v0.1.pt", - "multi-v0-forward-fast": f"{hu_path}/lm-multi-forward-fast-v0.1.pt", - "multi-v0-backward-fast": f"{hu_path}/lm-multi-backward-fast-v0.1.pt", + "multi-forward-fast": f"{hu_path}/lm-multi-forward-fast-v0.1.pt", + "multi-backward-fast": f"{hu_path}/lm-multi-backward-fast-v0.1.pt", # English models "en-forward": f"{hu_path}/news-forward-0.4.1.pt", "en-backward": f"{hu_path}/news-backward-0.4.1.pt", @@ -591,6 +591,7 @@ def train(self, mode=True): if "chars_per_chunk" not in self.__dict__: self.chars_per_chunk = 512 + # unless fine-tuning is set, do not set language model to train() in order to disallow language model dropout if not self.fine_tune: pass else: @@ -1274,7 +1275,8 @@ def __setstate__(self, d): if "config_state_dict" in d: # load transformer model - config_class = CONFIG_MAPPING[d["config_state_dict"]["model_type"]] + model_type = d["config_state_dict"]["model_type"] if "model_type" in d["config_state_dict"] else "bert" + config_class = CONFIG_MAPPING[model_type] loaded_config = config_class.from_dict(d["config_state_dict"]) # constructor arguments diff --git a/flair/models/language_model.py b/flair/models/language_model.py index 
1c632c2da..27f4b245e 100644 --- a/flair/models/language_model.py +++ b/flair/models/language_model.py @@ -399,6 +399,54 @@ def calculate_perplexity(self, text: str) -> float: return perplexity + def __getstate__(self): + + # serialize the language models and the constructor arguments (but nothing else) + model_state = { + "state_dict": self.state_dict(), + + "dictionary": self.dictionary, + "is_forward_lm": self.is_forward_lm, + "hidden_size": self.hidden_size, + "nlayers": self.nlayers, + "embedding_size": self.embedding_size, + "nout": self.nout, + "document_delimiter": self.document_delimiter, + "dropout": self.dropout, + } + + return model_state + + def __setstate__(self, d): + + # special handling for deserializing language models + if "state_dict" in d: + + # re-initialize language model with constructor arguments + language_model = LanguageModel( + dictionary=d['dictionary'], + is_forward_lm=d['is_forward_lm'], + hidden_size=d['hidden_size'], + nlayers=d['nlayers'], + embedding_size=d['embedding_size'], + nout=d['nout'], + document_delimiter=d['document_delimiter'], + dropout=d['dropout'], + ) + + language_model.load_state_dict(d['state_dict']) + + # copy over state dictionary to self + for key in language_model.__dict__.keys(): + self.__dict__[key] = language_model.__dict__[key] + + # set the language model to eval() by default (this is necessary since FlairEmbeddings "protect" the LM + # in their "self.train()" method) + self.eval() + + else: + self.__dict__ = d + def _apply(self, fn): # models that were serialized using torch versions older than 1.4.0 lack the _flat_weights_names attribute diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 3e95153f1..884a05f38 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -53,7 +53,7 @@ def __init__( super(TextClassifier, self).__init__() - self.document_embeddings: flair.embeddings.DocumentRNNEmbeddings = document_embeddings + self.document_embeddings: flair.embeddings.DocumentEmbeddings = document_embeddings self.label_dictionary: Dictionary = label_dictionary self.label_type = label_type @@ -474,7 +474,7 @@ def _fetch_model(model_name) -> str: hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models" model_map["de-offensive-language"] = "/".join( - [hu_path, "de-offensive-language", "germ-eval-2018-task-1-v0.5.pt"] + [hu_path, "de-offensive-language", "germ-eval-2018-task-1-v0.8.pt"] ) # English sentiment models @@ -485,7 +485,7 @@ def _fetch_model(model_name) -> str: [hu_path, "sentiment-curated-distilbert", "sentiment-en-mix-distillbert_4.pt"] ) model_map["sentiment-fast"] = "/".join( - [hu_path, "sentiment-curated-fasttext-rnn", "sentiment-en-mix-ft-rnn.pt"] + [hu_path, "sentiment-curated-fasttext-rnn", "sentiment-en-mix-ft-rnn_v8.pt"] ) # Communicative Functions Model @@ -558,6 +558,15 @@ def __init__( nn.init.xavier_uniform_(self.decoder.weight) + # else, set separator to concatenate two sentences + else: + self.sep = ' ' + if isinstance(self.document_embeddings, flair.embeddings.document.TransformerDocumentEmbeddings): + if self.document_embeddings.tokenizer.sep_token: + self.sep = ' ' + str(self.document_embeddings.tokenizer.sep_token) + ' ' + else: + self.sep = ' [SEP] ' + def _get_state_dict(self): model_state = super()._get_state_dict() model_state["bi_mode"] = self.bi_mode @@ -605,15 +614,12 @@ def forward(self, datapairs): else: # concatenate the sentences and embed together - # TODO: Transformers use 
special separator symbols in the beginning and between elements - # of datapair. Here should be a case dinstintion between the different transformers. - if isinstance(self.document_embeddings, flair.embeddings.document.TransformerDocumentEmbeddings): - sep = '[SEP]' - else: - sep = ' ' - - concatenated_sentences = [Sentence(pair.first.to_plain_string() + sep + pair.second.to_plain_string()) for - pair in datapairs] + concatenated_sentences = [ + Sentence( + pair.first.to_tokenized_string() + self.sep + pair.second.to_tokenized_string(), + use_tokenizer=False + ) + for pair in datapairs] self.document_embeddings.embed(concatenated_sentences) @@ -900,11 +906,11 @@ def _get_state_dict(self): def _init_model_with_state_dict(state): task_name = state["current_task"] print("init TARS") - + # init new TARS classifier model = TARSClassifier( task_name, - label_dictionary = state["task_specific_attributes"][task_name]['label_dictionary'], + label_dictionary=state["task_specific_attributes"][task_name]['label_dictionary'], document_embeddings=state["tars_model"].document_embeddings, num_negative_labels_to_sample=state["num_negative_labels_to_sample"], ) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index f797ff2b0..8233112ed 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -39,12 +39,12 @@ class ModelTrainer: def __init__( - self, - model: flair.nn.Model, - corpus: Corpus, - optimizer: torch.optim.Optimizer = SGD, - epoch: int = 0, - use_tensorboard: bool = False, + self, + model: flair.nn.Model, + corpus: Corpus, + optimizer: torch.optim.Optimizer = SGD, + epoch: int = 0, + use_tensorboard: bool = False, ): """ Initialize a model trainer @@ -61,40 +61,40 @@ def __init__( self.use_tensorboard: bool = use_tensorboard def train( - self, - base_path: Union[Path, str], - learning_rate: float = 0.1, - mini_batch_size: int = 32, - mini_batch_chunk_size: int = None, - max_epochs: int = 100, - scheduler = AnnealOnPlateau, - cycle_momentum: bool = False, - anneal_factor: float = 0.5, - patience: int = 3, - initial_extra_patience = 0, - min_learning_rate: float = 0.0001, - train_with_dev: bool = False, - train_with_test: bool = False, - monitor_train: bool = False, - monitor_test: bool = False, - embeddings_storage_mode: str = "cpu", - checkpoint: bool = False, - save_final_model: bool = True, - anneal_with_restarts: bool = False, - anneal_with_prestarts: bool = False, - batch_growth_annealing: bool = False, - shuffle: bool = True, - param_selection_mode: bool = False, - write_weights: bool = False, - num_workers: int = 6, - sampler=None, - use_amp: bool = False, - amp_opt_level: str = "O1", - eval_on_train_fraction=0.0, - eval_on_train_shuffle=False, - save_model_at_each_epoch=False, - main_score_type=("micro avg", 'f1-score'), - **kwargs, + self, + base_path: Union[Path, str], + learning_rate: float = 0.1, + mini_batch_size: int = 32, + mini_batch_chunk_size: int = None, + max_epochs: int = 100, + scheduler=AnnealOnPlateau, + cycle_momentum: bool = False, + anneal_factor: float = 0.5, + patience: int = 3, + initial_extra_patience=0, + min_learning_rate: float = 0.0001, + train_with_dev: bool = False, + train_with_test: bool = False, + monitor_train: bool = False, + monitor_test: bool = False, + embeddings_storage_mode: str = "cpu", + checkpoint: bool = False, + save_final_model: bool = True, + anneal_with_restarts: bool = False, + anneal_with_prestarts: bool = False, + batch_growth_annealing: bool = False, + shuffle: bool = True, + param_selection_mode: bool = 
False, + write_weights: bool = False, + num_workers: int = 6, + sampler=None, + use_amp: bool = False, + amp_opt_level: str = "O1", + eval_on_train_fraction=0.0, + eval_on_train_shuffle=False, + save_model_each_k_epochs: int = 0, + main_score_type=("micro avg", 'f1-score'), + **kwargs, ) -> dict: """ Trains any class that implements the flair.nn.Model interface. @@ -127,7 +127,9 @@ def train( if 'dev' the size is determined from dev set size :param eval_on_train_shuffle: if True the train data fraction is determined on the start of training and kept fixed during training, otherwise it's sampled at beginning of each epoch - :param save_model_at_each_epoch: If True, at each epoch the thus far trained model will be saved + :param save_model_each_k_epochs: Each k epochs, a model state will be written out. If set to '5', a model will + be saved each 5 epochs. Default is 0 which means no model saving. + :param save_model_epoch_step: Each save_model_epoch_step'th epoch the thus far trained model will be saved :param main_score_type: Type of metric to use for best model tracking and learning rate scheduling (if dev data is available, otherwise loss will be used) :param kwargs: Other arguments for the Optimizer :return: @@ -237,17 +239,18 @@ def train( # minimize training loss if training with dev data, else maximize dev score anneal_mode = "min" if train_with_dev else "max" - + if scheduler == OneCycleLR: dataset_size = len(self.corpus.train) if train_with_dev: dataset_size += len(self.corpus.dev) lr_scheduler = OneCycleLR(optimizer, - max_lr=learning_rate, - steps_per_epoch=dataset_size//mini_batch_size + 1, - epochs=max_epochs-self.epoch, # if we load a checkpoint, we have already trained for self.epoch - pct_start=0.0, - cycle_momentum=cycle_momentum) + max_lr=learning_rate, + steps_per_epoch=dataset_size // mini_batch_size + 1, + epochs=max_epochs - self.epoch, + # if we load a checkpoint, we have already trained for self.epoch + pct_start=0.0, + cycle_momentum=cycle_momentum) else: lr_scheduler = scheduler( optimizer, @@ -257,7 +260,7 @@ def train( mode=anneal_mode, verbose=True, ) - + if (isinstance(lr_scheduler, OneCycleLR) and batch_growth_annealing): raise ValueError("Batch growth with OneCycle policy is not implemented.") @@ -318,9 +321,9 @@ def train( # reload last best model if annealing with restarts is enabled if ( - (anneal_with_restarts or anneal_with_prestarts) - and learning_rate != previous_learning_rate - and (base_path / "best-model.pt").exists() + (anneal_with_restarts or anneal_with_prestarts) + and learning_rate != previous_learning_rate + and (base_path / "best-model.pt").exists() ): if anneal_with_restarts: log.info("resetting to best model") @@ -345,7 +348,7 @@ def train( batch_loader = DataLoader( train_data, batch_size=mini_batch_size, - shuffle=shuffle if self.epoch > 1 else False, # never shuffle the first epoch + shuffle=shuffle if self.epoch > 1 else False, # never shuffle the first epoch num_workers=num_workers, sampler=sampler, ) @@ -373,7 +376,7 @@ def train( batch_steps = [batch] if len(batch) > micro_batch_size: batch_steps = [ - batch[x : x + micro_batch_size] + batch[x: x + micro_batch_size] for x in range(0, len(batch), micro_batch_size) ] @@ -393,7 +396,7 @@ def train( # do the optimizer step torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) optimizer.step() - + # do the scheduler step if one-cycle if isinstance(lr_scheduler, OneCycleLR): lr_scheduler.step() @@ -401,7 +404,7 @@ def train( for group in optimizer.param_groups: learning_rate = 
group["lr"] if "momentum" in group: - momentum = group["momentum"] + momentum = group["momentum"] seen_batches += 1 train_loss += loss.item() @@ -591,11 +594,11 @@ def train( # if we use dev data, remember best model based on dev evaluation score if ( - (not train_with_dev or anneal_with_restarts or anneal_with_prestarts) - and not param_selection_mode - and not isinstance(lr_scheduler, OneCycleLR) - and current_score == lr_scheduler.best - and bad_epochs == 0 + (not train_with_dev or anneal_with_restarts or anneal_with_prestarts) + and not param_selection_mode + and not isinstance(lr_scheduler, OneCycleLR) + and current_score == lr_scheduler.best + and bad_epochs == 0 ): print("saving best model") self.model.save(base_path / "best-model.pt") @@ -605,8 +608,8 @@ def train( self.model.load_state_dict(last_epoch_model_state_dict) self.model.save(base_path / "pre-best-model.pt") self.model.load_state_dict(current_state_dict) - - if save_model_at_each_epoch: + + if save_model_each_k_epochs > 0 and not self.epoch % save_model_each_k_epochs: print("saving model of current epoch") model_name = "model_epoch_" + str(self.epoch) + ".pt" self.model.save(base_path / model_name) @@ -659,7 +662,7 @@ def load_checkpoint(cls, checkpoint: Union[Path, str], corpus: Corpus): return model def final_test( - self, base_path: Union[Path, str], eval_mini_batch_size: int, num_workers: int = 8 + self, base_path: Union[Path, str], eval_mini_batch_size: int, num_workers: int = 8 ): if type(base_path) is str: base_path = Path(base_path) @@ -708,16 +711,16 @@ def final_test( return final_score def find_learning_rate( - self, - base_path: Union[Path, str], - file_name: str = "learning_rate.tsv", - start_learning_rate: float = 1e-7, - end_learning_rate: float = 10, - iterations: int = 100, - mini_batch_size: int = 32, - stop_early: bool = True, - smoothing_factor: float = 0.98, - **kwargs, + self, + base_path: Union[Path, str], + file_name: str = "learning_rate.tsv", + start_learning_rate: float = 1e-7, + end_learning_rate: float = 10, + iterations: int = 100, + mini_batch_size: int = 32, + stop_early: bool = True, + smoothing_factor: float = 0.98, + **kwargs, ) -> Path: best_loss = None moving_avg_loss = 0 @@ -768,11 +771,11 @@ def find_learning_rate( else: if smoothing_factor > 0: moving_avg_loss = ( - smoothing_factor * moving_avg_loss - + (1 - smoothing_factor) * loss_item + smoothing_factor * moving_avg_loss + + (1 - smoothing_factor) * loss_item ) loss_item = moving_avg_loss / ( - 1 - smoothing_factor ** (step + 1) + 1 - smoothing_factor ** (step + 1) ) if loss_item < best_loss: best_loss = loss diff --git a/requirements.txt b/requirements.txt index 128ef661c..e66f458d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,5 +20,4 @@ sentencepiece==0.1.95 konoha<5.0.0,>=4.0.0 janome gdown==3.12.2 -numpy<1.20.0 huggingface-hub diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index cc0af3f4c..80531ebc2 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -173,6 +173,7 @@ data the first time you call the corresponding constructor ID. 
The following dat | 'CONLL_03_SPANISH' | Spanish | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | | 'EUROPARL_NER_GERMAN' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches | +| 'JAPANESE_NER' | Japanese | [https://github.com/Hironsan/IOB2Corpus] Japanese NER dataset automatically generated from Wikipedia | | 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | | 'MIT_MOVIE_NER_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER | | 'MIT_MOVIE_NER_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER | diff --git a/setup.py b/setup.py index 1bf6d311a..5cfa85a62 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flair", - version="0.8", + version="0.8.1", description="A very simple framework for state-of-the-art NLP", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py index 10ea99650..3de2c132d 100644 --- a/tests/test_embeddings.py +++ b/tests/test_embeddings.py @@ -9,6 +9,7 @@ FlairEmbeddings, DocumentRNNEmbeddings, DocumentLMEmbeddings, TransformerWordEmbeddings, TransformerDocumentEmbeddings, + DocumentCNNEmbeddings, ) from flair.data import Sentence, Dictionary @@ -287,4 +288,21 @@ def test_transformer_document_embeddings(): sentence.clear_embeddings() + del embeddings + +def test_document_cnn_embeddings(): + sentence: Sentence = Sentence("I love Berlin. Berlin is a great place to live.") + + embeddings: DocumentCNNEmbeddings = DocumentCNNEmbeddings( + [glove, flair_embedding], kernels=((50, 2), (50, 3)) + ) + + embeddings.embed(sentence) + + assert len(sentence.get_embedding()) == 100 + assert len(sentence.get_embedding()) == embeddings.embedding_length + + sentence.clear_embeddings() + + assert len(sentence.get_embedding()) == 0 del embeddings \ No newline at end of file
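
Usage sketch for the three corpus loaders added in this diff (ICELANDIC_NER, JAPANESE_NER, REDDIT_EL_GOLD). The constructor signatures follow the classes above; the default cache folder under flair.cache_root and the automatic first-time download/preprocessing are assumed.

from flair.datasets import ICELANDIC_NER, JAPANESE_NER, REDDIT_EL_GOLD

# each loader downloads and re-formats its raw files on the first call
icelandic_corpus = ICELANDIC_NER()   # columns 0=text, 1=ner, merged into icelandic_ner.txt
japanese_corpus = JAPANESE_NER()     # Hironsan IOB2 files merged into train.txt
reddit_corpus = REDDIT_EL_GOLD()     # gold entity links written out in NER-style columns

print(japanese_corpus)               # prints the train/dev/test split sizes
print(japanese_corpus.train[0].to_tagged_string("ner"))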
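
The CSV classification changes above add a "pair" column type: rows that map both a "text" and a "pair" column are returned as DataPair objects instead of single Sentences, with the row's labels attached to the pair. A sketch under assumed file layout; the folder name, delimiter and label column are illustrative.

from flair.datasets import CSVClassificationCorpus

# hypothetical tab-separated train/dev/test files with columns: premise, hypothesis, label
corpus = CSVClassificationCorpus(
    "resources/tasks/my_pair_task",
    column_name_map={0: "text", 1: "pair", 2: "label"},
    skip_header=True,
    delimiter="\t",
)

data_point = corpus.train[0]   # a DataPair(first=..., second=...) carrying the row's label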
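
The new DocumentCNNEmbeddings pools stacked token embeddings through parallel 1-D convolutions with adaptive max pooling, so the document embedding length is the sum of the kernel counts. A sketch mirroring the unit test added in this diff:

from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentCNNEmbeddings

glove = WordEmbeddings("glove")
document_embeddings = DocumentCNNEmbeddings([glove], kernels=((50, 2), (50, 3)))

sentence = Sentence("I love Berlin . Berlin is a great place to live .")
document_embeddings.embed(sentence)

assert len(sentence.get_embedding()) == 100   # 50 filters of width 2 + 50 filters of width 3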
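
In the trainer, save_model_at_each_epoch is replaced by the integer parameter save_model_each_k_epochs: every k-th epoch a checkpoint named model_epoch_<n>.pt is written, and the default 0 disables it. A sketch with placeholder model and corpus objects:

from flair.trainers import ModelTrainer

# `tagger` and `corpus` stand in for any flair.nn.Model and Corpus
trainer = ModelTrainer(tagger, corpus)
trainer.train(
    "resources/taggers/example",
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=20,
    save_model_each_k_epochs=5,   # writes model_epoch_5.pt, model_epoch_10.pt, ...
)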
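
The fast multilingual character LMs are now registered as 'multi-forward-fast' and 'multi-backward-fast'; the old 'multi-v0-*-fast' keys are removed from the model map. Minimal sketch:

from flair.embeddings import FlairEmbeddings, StackedEmbeddings

stacked = StackedEmbeddings([
    FlairEmbeddings("multi-forward-fast"),
    FlairEmbeddings("multi-backward-fast"),
])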
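
LanguageModel and DocumentRNNEmbeddings now implement __getstate__/__setstate__ so that pickling stores only the constructor arguments plus state_dict(), and unpickling re-instantiates the module, reloads the weights and puts it into eval() mode. A round-trip sketch; the file path is illustrative.

import torch
from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings

embeddings = DocumentRNNEmbeddings([FlairEmbeddings("news-forward-fast")], hidden_size=128)

torch.save(embeddings, "document_rnn_embeddings.pt")   # serialized via the new __getstate__
reloaded = torch.load("document_rnn_embeddings.pt")    # __setstate__ rebuilds the RNN and loads weights
assert reloaded.embedding_length == embeddings.embedding_length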