diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index e2d481edee..5617ca1e25 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -516,7 +516,8 @@ class Doc2Vec(Word2Vec):
     def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5,
                  max_vocab_size=None, sample=0, seed=1, workers=1, min_alpha=0.0001,
                  dm=1, hs=1, negative=0, dbow_words=0, dm_mean=0, dm_concat=0, dm_tag_count=1,
-                 docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, **kwargs):
+                 docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None,
+                 pretrained_emb=None, **kwargs):
         """
         Initialize the model from an iterable of `documents`. Each document is a
         TaggedDocument object that will be used for training.
@@ -577,12 +578,14 @@ def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5,
         Note: The rule, if given, is only used prune vocabulary during build_vocab() and is not stored as part
         of the model.
 
+        `pretrained_emb` = path to pre-trained word embeddings used to initialise the word vectors; format = original C word2vec-tool non-binary (text) format, i.e. one embedding per word
+
         """
         super(Doc2Vec, self).__init__(
             size=size, alpha=alpha, window=window, min_count=min_count, max_vocab_size=max_vocab_size,
             sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
             sg=(1+dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean,
-            null_word=dm_concat, **kwargs)
+            null_word=dm_concat, pretrained_emb=pretrained_emb, **kwargs)
         self.dbow_words = dbow_words
         self.dm_concat = dm_concat
         self.dm_tag_count = dm_tag_count
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 3fa5b592ec..1cd2045742 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -77,6 +77,7 @@
 from collections import defaultdict
 import threading
 import itertools
+import time
 
 from gensim.utils import keep_vocab_item
 
@@ -344,7 +345,8 @@ def __init__(
             self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
             max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
             sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
-            trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH):
+            trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH,
+            pretrained_emb=None):
         """
         Initialize the model from an iterable of `sentences`. Each sentence is a
         list of words (unicode strings) that will be used for training.
@@ -409,6 +411,8 @@ def __init__(
         thus cython routines). Default is 10000. (Larger batches can be passed if individual
         texts are longer, but the cython code may truncate.)
 
+        `pretrained_emb` = path to pre-trained word embeddings used to initialise the word vectors; format = original C word2vec-tool non-binary (text) format, i.e. one embedding per word
+
         """
         self.vocab = {}  # mapping from a word (string) to a Vocab object
         self.index2word = []  # map from a word's matrix index (int) to word (string)
@@ -437,6 +441,7 @@ def __init__(
         self.total_train_time = 0
         self.sorted_vocab = sorted_vocab
         self.batch_words = batch_words
+        self.pretrained_emb = pretrained_emb
 
         if sentences is not None:
             if isinstance(sentences, GeneratorType):
@@ -978,12 +983,40 @@ def clear_sims(self):
 
     def reset_weights(self):
         """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""
+
+        # if a pre-trained embedding file is given, load it first
+        p_emb = {}
+        t = time.time()
+        if self.pretrained_emb is not None:
+            logger.info("loading pre-trained embeddings")
+            with utils.smart_open(self.pretrained_emb) as fin:
+                header = utils.to_unicode(fin.readline(), encoding="utf8")
+                vocab_size, vector_size = map(int, header.split())
+                if vector_size != self.vector_size:
+                    logger.info("pre-trained embedding vector size differs from the specified training vector size; pre-trained embeddings will be ignored")
+                else:
+                    for line_no, line in enumerate(fin):
+                        parts = utils.to_unicode(line.rstrip(), encoding="utf8", errors="strict").split(" ")
+                        if len(parts) != self.vector_size + 1:
+                            raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
+                        word, weights = parts[0], list(map(REAL, parts[1:]))
+                        if word in self.vocab:
+                            p_emb[word] = weights
+                        if line_no % 10000 == 0:
+                            logger.info("%i lines processed (%.1fs); %i embeddings collected", line_no, time.time() - t, len(p_emb))
+                            t = time.time()
+
         logger.info("resetting layer weights")
         self.syn0 = empty((len(self.vocab), self.vector_size), dtype=REAL)
         # randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once
         for i in xrange(len(self.vocab)):
-            # construct deterministic seed from word AND seed argument
-            self.syn0[i] = self.seeded_vector(self.index2word[i] + str(self.seed))
+            word = self.index2word[i]
+            if len(p_emb) > 0 and word in p_emb:
+                # use the pre-trained embedding for this word
+                self.syn0[i] = p_emb[word]
+            else:
+                # construct deterministic seed from word AND seed argument
+                self.syn0[i] = self.seeded_vector(word + str(self.seed))
         if self.hs:
             self.syn1 = zeros((len(self.vocab), self.layer1_size), dtype=REAL)
         if self.negative:
@@ -1636,6 +1669,7 @@ def __iter__(self):
     parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1])
     parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1])
     parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")
+    parser.add_argument("-pretrained_emb", help="Use pre-trained embeddings; format = original C word2vec-tool non-binary format (i.e. one embedding per word)")
 
     args = parser.parse_args()
 
@@ -1646,7 +1680,7 @@ def __iter__(self):
 
     corpus = LineSentence(args.train)
 
-    model = Word2Vec(corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window,sample=args.sample,sg=skipgram,hs=args.hs,negative=args.negative,cbow_mean=1,iter=args.iter)
+    model = Word2Vec(corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window,sample=args.sample,sg=skipgram,hs=args.hs,negative=args.negative,cbow_mean=1,iter=args.iter,pretrained_emb=args.pretrained_emb)
 
     if args.output:
         outfile = args.output
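
The loader added to reset_weights() above expects the original C word2vec-tool text (non-binary) format: a header line giving the vocabulary size and vector dimensionality, then one line per word containing the word followed by its space-separated vector components. Below is a minimal sketch of producing such a file; it is not part of the patch, and the file name and toy vectors are hypothetical. Note that the dimensionality must match the model's `size`, otherwise the patched code logs a message and ignores the file.

# Write a tiny 4-dimensional embedding file in the text word2vec format
# read by the patched reset_weights(); purely illustrative toy data.
vectors = {
    "human": [0.1, 0.2, 0.3, 0.4],
    "interface": [0.5, 0.1, -0.2, 0.0],
    "computer": [-0.3, 0.4, 0.1, 0.2],
}
with open("pretrained_vectors.txt", "w") as fout:
    fout.write("%i %i\n" % (len(vectors), 4))  # header: vocab_size vector_size
    for word, vec in vectors.items():
        fout.write("%s %s\n" % (word, " ".join("%f" % v for v in vec)))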
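
For context, a minimal usage sketch of the new parameter, assuming this patch is applied; the corpus, sentences and file names are hypothetical. Words present in the embedding file start from their pre-trained vectors, all other vocabulary words fall back to the usual seeded random initialisation, and Doc2Vec simply forwards the parameter to its Word2Vec superclass.

from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

sentences = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]

# size must equal the dimensionality of the pre-trained file (4 in the toy
# file above), otherwise the pre-trained embeddings are ignored.
model = Word2Vec(sentences, size=4, min_count=1, pretrained_emb="pretrained_vectors.txt")

documents = [TaggedDocument(words, [i]) for i, words in enumerate(sentences)]
dmodel = Doc2Vec(documents, size=4, min_count=1, pretrained_emb="pretrained_vectors.txt")

# The standalone script exposes the same option, e.g.:
#   python gensim/models/word2vec.py -train corpus.txt -output vectors.txt -size 4 -pretrained_emb pretrained_vectors.txt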