
Commit

Merge pull request #1332 from parulsethi/Fix_1310
Fix issue-1310
menshikh-iv authored May 23, 2017
2 parents e156ff9 + 4d413d7 commit a49aa9b
Showing 3 changed files with 11 additions and 8 deletions.
14 changes: 8 additions & 6 deletions gensim/models/wrappers/wordrank.py
@@ -8,7 +8,7 @@
`Word2Vec` for that.
Example:
>>> model = gensim.models.wrappers.Wordrank('/Users/dummy/wordrank', corpus_file='text8', out_path='wr_model')
>>> model = gensim.models.wrappers.Wordrank('/Users/dummy/wordrank', corpus_file='text8', out_name='wr_model')
>>> print model[word] # prints vector for given words
.. [1] https://bitbucket.org/shihaoji/wordrank/
@@ -45,14 +45,14 @@ class Wordrank(KeyedVectors):
"""

@classmethod
def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
beta=99, loss='hinge', memory=4.0, cleanup_files=True, sorted_vocab=1, ensemble=0):
"""
`wr_path` is the path to the Wordrank directory.
`corpus_file` is the filename of the text file to be used for training the Wordrank model.
Expects file to contain space-separated tokens in a single line
`out_path` is the path to directory which will be created to save embeddings and training data.
`out_name` is the name of the directory which will be created (in the wordrank folder) to save embeddings and training data.
`size` is the dimensionality of the feature vectors.
`window` is the number of context words to the left (and to the right, if symmetric = 1).
`symmetric` if 0, only use left context words, else use left and right both.
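
For orientation, here is a hedged usage sketch of the renamed argument; the wordrank path, corpus file, and queried word are placeholders rather than values from this commit:

```python
from gensim.models.wrappers import Wordrank

# Hypothetical local setup: a WordRank checkout and a single-line, space-separated corpus.
wr_path = '/home/user/wordrank'
corpus_file = 'text8'

# `out_name` names the output directory that train() creates inside `wr_path`.
model = Wordrank.train(wr_path, corpus_file, out_name='wr_model')
print(model['language'])  # embedding vector for the word 'language'
```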
@@ -82,7 +82,7 @@ def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1,
meta_file = 'meta'

# prepare training data (cooccurrence matrix and vocab)
model_dir = os.path.join(wr_path, out_path)
model_dir = os.path.join(wr_path, out_name)
meta_dir = os.path.join(model_dir, 'meta')
os.makedirs(meta_dir)
logger.info("Dumped data will be stored in '%s'", model_dir)
@@ -95,14 +95,16 @@ def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1,
cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
logger.info("Prepare training data using glove code '%s'", commands)
input_fnames = [corpus_file.split('/')[-1], corpus_file.split('/')[-1], cooccurrence_file]
output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

logger.info("Prepare training data using glove code")
for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
with smart_open(input_fname, 'rb') as r:
with smart_open(output_fname, 'wb') as w:
utils.check_output(w, args=command, stdin=r)

logger.info("Delete frequencies from vocab file")
with smart_open(vocab_file, 'wb') as w:
utils.check_output(w, args=cmd_del_vocab_freq)

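The loop above redirects each GloVe preprocessing command through explicit file handles. A self-contained sketch of the same stdin/stdout pattern using only the standard library (the command and file names are illustrative, not taken from this commit):

```python
import subprocess

# Illustrative command: pipe a corpus file through an external tool, capturing its output.
command = ['sort', '-u']            # stand-in for a glove tool such as vocab_count
with open('text8', 'rb') as r, open('vocab.txt', 'wb') as w:
    subprocess.check_call(command, stdin=r, stdout=w)
```
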
@@ -147,7 +149,7 @@ def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1,
for option, value in wr_args.items():
cmd.append('--%s' % option)
cmd.append(str(value))
logger.info("Running wordrank binary '%s'", cmd)
logger.info("Running wordrank binary")
output = utils.check_output(args=cmd)

# use embeddings from max. iteration's dump
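
For reference, the flag-building loop above expands a dict of options into command-line arguments; a small hedged sketch with an illustrative subset of options (the real `wr_args` and binary invocation are constructed earlier in `train`):

```python
# Illustrative subset of WordRank options; not the full argument set used by train().
wr_args = {'size': 100, 'window': 15, 'loss': 'hinge'}

cmd = ['./wordrank']                # hypothetical binary path
for option, value in wr_args.items():
    cmd.append('--%s' % option)
    cmd.append(str(value))
# e.g. cmd == ['./wordrank', '--size', '100', '--window', '15', '--loss', 'hinge']
# (exact ordering follows dict iteration order)
```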
4 changes: 2 additions & 2 deletions gensim/test/test_wordrank_wrapper.py
@@ -30,11 +30,11 @@ def setUp(self):
wr_home = os.environ.get('WR_HOME', None)
self.wr_path = wr_home if wr_home else None
self.corpus_file = datapath('lee.cor')
self.out_path = 'testmodel'
self.out_name = 'testmodel'
self.wr_file = datapath('test_glove.txt')
if not self.wr_path:
return
self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_path, iter=6, dump_period=5,period=5)
self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_name, iter=6, dump_period=5, period=5)

def testLoadWordrankFormat(self):
"""Test model successfully loaded from Wordrank format file"""
1 change: 1 addition & 0 deletions gensim/utils.py
@@ -1164,6 +1164,7 @@ def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs):
Added extra KeyboardInterrupt handling
"""
try:
logger.debug("COMMAND: %s %s", str(popenargs), str(kwargs))
process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs)
output, unused_err = process.communicate()
retcode = process.poll()
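
The new debug line records every external command before it is launched. A hedged usage sketch of this helper; the logging setup and the echo command are illustrative only:

```python
import logging
from gensim import utils

logging.basicConfig(level=logging.DEBUG)   # surface the new "COMMAND: ..." debug message

# check_output logs the command at DEBUG level, runs it, and returns captured stdout.
output = utils.check_output(args=['echo', 'hello'])
print(output)  # b'hello\n'
```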
