
updating doc2vec and word2vec to take pre-trained word embeddings. Format = original c code txt format
jhlau committed Feb 9, 2016
1 parent 5c20189 commit 9dc0f79
Showing 2 changed files with 43 additions and 6 deletions.
7 changes: 5 additions & 2 deletions gensim/models/doc2vec.py
@@ -516,7 +516,8 @@ class Doc2Vec(Word2Vec):
def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5,
max_vocab_size=None, sample=0, seed=1, workers=1, min_alpha=0.0001,
dm=1, hs=1, negative=0, dbow_words=0, dm_mean=0, dm_concat=0, dm_tag_count=1,
docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, **kwargs):
docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None,
pretrained_emb=None, **kwargs):
"""
Initialize the model from an iterable of `documents`. Each document is a
TaggedDocument object that will be used for training.
@@ -577,12 +578,14 @@ def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5,
Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
of the model.
`pretrained_emb` = path to pre-trained embeddings for the word vectors; format = original C word2vec-tool non-binary (text) format, i.e. one embedding per word per line
"""
super(Doc2Vec, self).__init__(
size=size, alpha=alpha, window=window, min_count=min_count, max_vocab_size=max_vocab_size,
sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
sg=(1+dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean,
null_word=dm_concat, **kwargs)
null_word=dm_concat, pretrained_emb=pretrained_emb, **kwargs)
self.dbow_words = dbow_words
self.dm_concat = dm_concat
self.dm_tag_count = dm_tag_count
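For context, a minimal usage sketch of the new Doc2Vec parameter against this patched build (the corpus and the "pretrained_words.txt" path are hypothetical; per the reset_weights() change further below, `size` must match the dimensionality of the pre-trained vectors or the file is ignored):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# toy corpus; each document is a TaggedDocument, as described in the docstring above
documents = [TaggedDocument(words=["some", "example", "words"], tags=["doc_0"]),
             TaggedDocument(words=["a", "few", "more", "words"], tags=["doc_1"])]

# pretrained_emb points to word vectors in the original C word2vec-tool text format
model = Doc2Vec(documents, size=300, window=8, min_count=1,
                pretrained_emb="pretrained_words.txt")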
42 changes: 38 additions & 4 deletions gensim/models/word2vec.py
@@ -77,6 +77,7 @@
from collections import defaultdict
import threading
import itertools
import time

from gensim.utils import keep_vocab_item

@@ -344,7 +345,8 @@ def __init__(
self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH):
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH,
pretrained_emb=None):
"""
Initialize the model from an iterable of `sentences`. Each sentence is a
list of words (unicode strings) that will be used for training.
@@ -409,6 +411,8 @@ def __init__(
thus cython routines). Default is 10000. (Larger batches can be passed if individual
texts are longer, but the cython code may truncate.)
`pretrained_emb` = path to pre-trained embeddings for the word vectors; format = original C word2vec-tool non-binary (text) format, i.e. one embedding per word per line
"""
self.vocab = {} # mapping from a word (string) to a Vocab object
self.index2word = [] # map from a word's matrix index (int) to word (string)
@@ -437,6 +441,7 @@ def __init__(
self.total_train_time = 0
self.sorted_vocab = sorted_vocab
self.batch_words = batch_words
self.pretrained_emb = pretrained_emb

if sentences is not None:
if isinstance(sentences, GeneratorType):
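With the parameter stored on the model, a corresponding Word2Vec sketch would look as follows (toy sentences and a placeholder embeddings path; this assumes the patched build and that the file holds 100-dimensional vectors to match `size`):

from gensim.models.word2vec import Word2Vec

sentences = [["the", "quick", "brown", "fox"], ["the", "lazy", "dog"]]  # toy corpus
model = Word2Vec(sentences, size=100, min_count=1,
                 pretrained_emb="pretrained_words.txt")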
@@ -978,12 +983,40 @@ def clear_sims(self):

def reset_weights(self):
"""Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""

# if a pre-trained embedding file is given, load it
p_emb = {}
t = time.time()
if self.pretrained_emb is not None:
logger.info("loading pre-trained embeddings")
with utils.smart_open(self.pretrained_emb) as fin:
header = utils.to_unicode(fin.readline(), encoding="utf8")
vocab_size, vector_size = map(int, header.split())
if vector_size != self.vector_size:
logger.info("pre-trained embedding vector size is different to the specified training vector size; pre-trained embeddings will be ignored")
else:
for line_no, line in enumerate(fin):
parts = utils.to_unicode(line.rstrip(), encoding="utf-8", errors="strict").split(" ")
if len(parts) != self.vector_size + 1:
raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
word, weights = parts[0], list(map(REAL, parts[1:]))
if word in self.vocab:
p_emb[word] = weights
if line_no % 10000 == 0:
logger.info("%i lines processed (%.1fs); %i embeddings collected", line_no, time.time() - t, len(p_emb))
t = time.time()

logger.info("resetting layer weights")
self.syn0 = empty((len(self.vocab), self.vector_size), dtype=REAL)
# randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once
for i in xrange(len(self.vocab)):
# construct deterministic seed from word AND seed argument
self.syn0[i] = self.seeded_vector(self.index2word[i] + str(self.seed))
word = self.index2word[i]
if (len(p_emb) > 0) and (word in p_emb):
#use pre-trained embeddings
self.syn0[i] = p_emb[word]
else:
# construct deterministic seed from word AND seed argument
self.syn0[i] = self.seeded_vector(word + str(self.seed))
if self.hs:
self.syn1 = zeros((len(self.vocab), self.layer1_size), dtype=REAL)
if self.negative:
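For reference, a sketch of the text format this loader expects: a header line with the vocabulary size and vector size, then one word per line followed by its space-separated components. The words, values, and file name below are made up; the snippet only illustrates the layout the parser above reads.

# write a toy 3-dimensional embeddings file in the C word2vec-tool text format
vectors = {"the": [0.1, -0.2, 0.3], "of": [0.0, 0.4, -0.1]}
with open("pretrained_words.txt", "w") as fout:
    fout.write("%d %d\n" % (len(vectors), 3))  # header: vocab_size vector_size
    for word, vec in vectors.items():
        fout.write("%s %s\n" % (word, " ".join("%f" % x for x in vec)))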
@@ -1636,6 +1669,7 @@ def __iter__(self):
parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1])
parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1])
parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")
parser.add_argument("-pretrained_emb", help="Use pre-trained embeddings; format = original C word2vec-tool non-binary format (i.e. one embedding per word)")

args = parser.parse_args()

@@ -1646,7 +1680,7 @@ def __iter__(self):

corpus = LineSentence(args.train)

model = Word2Vec(corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window,sample=args.sample,sg=skipgram,hs=args.hs,negative=args.negative,cbow_mean=1,iter=args.iter)
model = Word2Vec(corpus, size=args.size, min_count=args.min_count, workers=args.threads, window=args.window,sample=args.sample,sg=skipgram,hs=args.hs,negative=args.negative,cbow_mean=1,iter=args.iter,pretrained_emb=args.pretrained_emb)

if args.output:
outfile = args.output
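When word2vec.py is run as a script, the new -pretrained_emb flag mirrors the Python parameter. A hypothetical invocation (script path and file names are placeholders; the -size value must match the dimensionality of the pre-trained vectors):

python gensim/models/word2vec.py -train corpus.txt -output vectors.txt -size 300 -pretrained_emb pretrained_words.txt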
