
Commit

Merge pull request #1332 from parulsethi/Fix_1310
Fix issue-1310
menshikh-iv authored May 23, 2017
2 parents e156ff9 + 4d413d7 commit a49aa9b
Showing 3 changed files with 11 additions and 8 deletions.
14 changes: 8 additions & 6 deletions gensim/models/wrappers/wordrank.py
@@ -8,7 +8,7 @@
`Word2Vec` for that.
Example:
>>> model = gensim.models.wrappers.Wordrank('/Users/dummy/wordrank', corpus_file='text8', out_path='wr_model')
>>> model = gensim.models.wrappers.Wordrank('/Users/dummy/wordrank', corpus_file='text8', out_name='wr_model')
>>> print model[word] # prints vector for given words
.. [1] https://bitbucket.org/shihaoji/wordrank/
@@ -45,14 +45,14 @@ class Wordrank(KeyedVectors):
"""

@classmethod
def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
beta=99, loss='hinge', memory=4.0, cleanup_files=True, sorted_vocab=1, ensemble=0):
"""
`wr_path` is the path to the Wordrank directory.
`corpus_file` is the filename of the text file to be used for training the Wordrank model.
Expects file to contain space-separated tokens in a single line
`out_path` is the path to directory which will be created to save embeddings and training data.
`out_name` is the name of the directory which will be created (in the wordrank folder) to save embeddings and training data.
`size` is the dimensionality of the feature vectors.
`window` is the number of context words to the left (and to the right, if symmetric = 1).
`symmetric` if 0, only use left context words, else use left and right both.
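
For orientation, here is a hedged usage sketch of the renamed argument; the wordrank path, corpus file, and queried word are placeholders rather than values from this commit:

```python
from gensim.models.wrappers import Wordrank

# Hypothetical local setup: a WordRank checkout and a single-line, space-separated corpus.
wr_path = '/home/user/wordrank'
corpus_file = 'text8'

# `out_name` names the output directory that train() creates inside `wr_path`.
model = Wordrank.train(wr_path, corpus_file, out_name='wr_model')
print(model['language'])  # embedding vector for the word 'language'
```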
@@ -82,7 +82,7 @@ def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1,
meta_file = 'meta'

# prepare training data (cooccurrence matrix and vocab)
model_dir = os.path.join(wr_path, out_path)
model_dir = os.path.join(wr_path, out_name)
meta_dir = os.path.join(model_dir, 'meta')
os.makedirs(meta_dir)
logger.info("Dumped data will be stored in '%s'", model_dir)
@@ -95,14 +95,16 @@ def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1,
cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
logger.info("Prepare training data using glove code '%s'", commands)
input_fnames = [corpus_file.split('/')[-1], corpus_file.split('/')[-1], cooccurrence_file]
output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

logger.info("Prepare training data using glove code")
for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
with smart_open(input_fname, 'rb') as r:
with smart_open(output_fname, 'wb') as w:
utils.check_output(w, args=command, stdin=r)

logger.info("Delete frequencies from vocab file")
with smart_open(vocab_file, 'wb') as w:
utils.check_output(w, args=cmd_del_vocab_freq)

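The loop above redirects each GloVe preprocessing command through explicit file handles. A self-contained sketch of the same stdin/stdout pattern using only the standard library (the command and file names are illustrative, not taken from this commit):

```python
import subprocess

# Illustrative command: pipe a corpus file through an external tool, capturing its output.
command = ['sort', '-u']            # stand-in for a glove tool such as vocab_count
with open('text8', 'rb') as r, open('vocab.txt', 'wb') as w:
    subprocess.check_call(command, stdin=r, stdout=w)
```
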
@@ -147,7 +149,7 @@ def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1,
for option, value in wr_args.items():
cmd.append('--%s' % option)
cmd.append(str(value))
logger.info("Running wordrank binary '%s'", cmd)
logger.info("Running wordrank binary")
output = utils.check_output(args=cmd)

# use embeddings from max. iteration's dump
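
For reference, the flag-building loop above expands a dict of options into command-line arguments; a small hedged sketch with an illustrative subset of options (the real `wr_args` and binary invocation are constructed earlier in `train`):

```python
# Illustrative subset of WordRank options; not the full argument set used by train().
wr_args = {'size': 100, 'window': 15, 'loss': 'hinge'}

cmd = ['./wordrank']                # hypothetical binary path
for option, value in wr_args.items():
    cmd.append('--%s' % option)
    cmd.append(str(value))
# e.g. cmd == ['./wordrank', '--size', '100', '--window', '15', '--loss', 'hinge']
# (exact ordering follows dict iteration order)
```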
4 changes: 2 additions & 2 deletions gensim/test/test_wordrank_wrapper.py
@@ -30,11 +30,11 @@ def setUp(self):
wr_home = os.environ.get('WR_HOME', None)
self.wr_path = wr_home if wr_home else None
self.corpus_file = datapath('lee.cor')
self.out_path = 'testmodel'
self.out_name = 'testmodel'
self.wr_file = datapath('test_glove.txt')
if not self.wr_path:
return
self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_path, iter=6, dump_period=5,period=5)
self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_name, iter=6, dump_period=5, period=5)

def testLoadWordrankFormat(self):
"""Test model successfully loaded from Wordrank format file"""
1 change: 1 addition & 0 deletions gensim/utils.py
@@ -1164,6 +1164,7 @@ def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs):
Added extra KeyboardInterrupt handling
"""
try:
logger.debug("COMMAND: %s %s", str(popenargs), str(kwargs))
process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs)
output, unused_err = process.communicate()
retcode = process.poll()
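
The new debug line records every external command before it is launched. A hedged usage sketch of this helper; the logging setup and the echo command are illustrative only:

```python
import logging
from gensim import utils

logging.basicConfig(level=logging.DEBUG)   # surface the new "COMMAND: ..." debug message

# check_output logs the command at DEBUG level, runs it, and returns captured stdout.
output = utils.check_output(args=['echo', 'hello'])
print(output)  # b'hello\n'
```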
