File-based fast training for Any2Vec models #2127

Merged: 133 commits (branch feature/multistream-training into develop), Sep 14, 2018.
The diff below shows the changes from 20 of the 133 commits, together with the inline review comments.

Commits:
39a2c11  CythonLineSentence (Jul 9, 2018)
20c22f7  fix (Jul 9, 2018)
dd0e9ca  fix setup.py (Jul 9, 2018)
6203c77  fixes (Jul 9, 2018)
03bf799  some refactoring (Jul 9, 2018)
660493f  remove printf (Jul 10, 2018)
1aedfe8  compiled (Jul 10, 2018)
9ff0bb1  second branch for pystreams (Jul 10, 2018)
9e498b7  fix (Jul 10, 2018)
1d4a2a8  learning rate decay in Cython + _do_train_epoch + _train_epoch_multis… (Jul 11, 2018)
97bac7e  add train_epoch_sg function (Jul 11, 2018)
4de3a84  call _train_epoch_multistream from train() (Jul 11, 2018)
36d1412  add word2vec_inner.cpp (Jul 11, 2018)
625025b  remove pragma from .cpp (Jul 11, 2018)
8173da8  Merge branch 'develop' into feature/multistream-training (Jul 12, 2018)
bd0a0e0  fix doc (Jul 12, 2018)
63663fa  fix pip (Jul 12, 2018)
2ee2405  add __reduce__ to CythonLineSentence for proper pickling (Jul 14, 2018)
8f8e817  remove printf (Jul 14, 2018)
ac28bbb  add 1 test for CythonLineSentence (Jul 14, 2018)
942a12f  no vocab copying (Jul 18, 2018)
2a44fbc  fixed (Jul 18, 2018)
e4a8ba0  Revert "fixed" (Jul 19, 2018)
394a417  Revert "no vocab copying" (Jul 19, 2018)
9ab6b1b  remove input_streams, add corpus_file (Jul 24, 2018)
5d2e2cf  fix (Jul 24, 2018)
0489561  fix replacing input_streams -> corpus_file in Word2Vec class (Jul 24, 2018)
901cad4  upd .cpp (Jul 26, 2018)
c09035c  add C++11 compiler flags (Jul 26, 2018)
1e3c314  pep8 (Jul 26, 2018)
d6755be  add link args too (Jul 26, 2018)
cc4680c  upd FastLineSentence (Jul 26, 2018)
9978f6b  fix signatures in doc2vec/fasttext + removed tests on multistream (Jul 26, 2018)
35333dd  fix flake (Jul 26, 2018)
86b91ac  clean up base_any2vec.py (Jul 26, 2018)
fca6f50  fix (Jul 26, 2018)
45ca084  fix CythonLineSentence ctor (Jul 26, 2018)
16bb386  fix py3 type error (Jul 26, 2018)
c83b96f  fix again (Jul 26, 2018)
1a21b0b  try again (Jul 26, 2018)
dd83a3e  new error (Jul 26, 2018)
c72f0b6  fix test (Jul 27, 2018)
74e51b3  add unordered_map wrapper (Jul 30, 2018)
58fc112  upd (Jul 30, 2018)
5e70184  fix cython compiling errors (Jul 30, 2018)
9727782  upd word2vec_inner.cpp (Jul 30, 2018)
d97ac0c  add some tests (Jul 31, 2018)
b6d7bb3  more tests for corpus_file (Jul 31, 2018)
0c1fc5f  fix docstrings (Jul 31, 2018)
fd66e34  addressing comments (Aug 1, 2018)
da9f3da  fix tests skipIf (Aug 1, 2018)
81329d6  add persistence test (Aug 1, 2018)
f2ba633  online learning tests (Aug 1, 2018)
51cec43  fix save_as_line_sentence (Aug 1, 2018)
a72ddf1  fix again (Aug 1, 2018)
aba7682  address new comments (Aug 2, 2018)
03d44b2  fix test (Aug 2, 2018)
e4e8cb2  move multistream functions from word2vec_inner to word2vec_multistream (Aug 2, 2018)
3e989de  fix tests (Aug 2, 2018)
d8c5cdc  add .c file (Aug 3, 2018)
2a42b85  fix test (Aug 3, 2018)
002a60c  fix tests skipIf and setup.py (Aug 3, 2018)
3850f49  fix mac os compatibility (Aug 3, 2018)
c1e8a9b  add tutorial on w2v multistream (Aug 9, 2018)
7b7195b  300% -> 200% in notebook (Aug 10, 2018)
3a8a915  add MULTISTREAM_VERSION global constant (Aug 10, 2018)
6beb96a  first move towards multistream FastText (Aug 10, 2018)
a2eb5fc  move MULTISTREAM_VERSION (Aug 10, 2018)
57f7b66  fix error (Aug 10, 2018)
83ce7c2  fix CythonVocab (Aug 10, 2018)
a3ede08  regenerated .c & .cpp files (Aug 10, 2018)
d38463e  resolve ambiguate fast_sentence_* declarations (Aug 11, 2018)
ec4c677  add test_training_multistream for fasttext (Aug 11, 2018)
a5311d2  add skipif (Aug 11, 2018)
f499d5b  add more tests (Aug 11, 2018)
645499c  fix flake8 (Aug 11, 2018)
dc1b98d  add short example (Aug 12, 2018)
b9564e9  upd jupyter notebook (Aug 13, 2018)
eefdd65  fix docstrings in doc2vec (Aug 14, 2018)
f669979  add d2v_train_epoch_dbow for from-file training (Aug 14, 2018)
e80189f  add missing parts of from-file doc2vec (Aug 15, 2018)
cf6b032  refactored a bit (Aug 15, 2018)
87d8ea7  add total_corpus_count calculation in doc2vec (Aug 15, 2018)
e2851b4  Merge branch 'develop' into feature/multistream-training (persiyanov, Aug 15, 2018)
1fdaa43  add tests for doc2vec file-based + rename MULTISTREAM -> CORPUSFILE e… (Aug 15, 2018)
c2fa0d8  regenerated .c + .cpp files (Aug 15, 2018)
5427416  add Word2VecConfig in order to remove repeating parts of code (Aug 15, 2018)
7f7760b  make shared initialization (Aug 15, 2018)
926fd5e  use init_config from word2vec_corpusfile (Aug 15, 2018)
df47983  add FastTextConfig (Aug 15, 2018)
0df7f6f  init_config -> init_w2v_config, init_ft_config (Aug 15, 2018)
5fd1c99  regenerated .c & .cpp files (Aug 15, 2018)
d9257be  using FastTextConfig in fasttext_corpusfile.pyx (Aug 15, 2018)
67c572c  fix (Aug 15, 2018)
8e82b9f  fix (Aug 15, 2018)
db2a77f  fix next_random in w2v (Aug 15, 2018)
a96bc6d  introduce Doc2VecConfig (Aug 16, 2018)
3b4da64  fix init_d2v_config (Aug 16, 2018)
53b967c  use Doc2VecConfig in doc2vec_corpusfile.pyx (Aug 16, 2018)
f57d1cb  removed unused vars (Aug 16, 2018)
b652afe  fix docstrings (Aug 16, 2018)
260cfb5  fix more docstrings (Aug 16, 2018)
a433018  test old model for doc2vec & fasttext (Aug 16, 2018)
20ec49b  fix loading old models (Aug 16, 2018)
1ced17d  fix fasttext model checking (Aug 16, 2018)
0731449  merge fast_line_sentence.cpp and fast_line_sentence.h (Aug 16, 2018)
35f0ab4  fix word2vec test (Aug 16, 2018)
49905f0  fix syntax error (Aug 16, 2018)
95c6ec9  remove redundanta seekg call (Aug 16, 2018)
aed2b6b  fix example notebook (Aug 16, 2018)
c1af621  add initial doc_tags computation (Aug 16, 2018)
33bf97a  fix test (Aug 16, 2018)
e592b6a  fix test for windows (Aug 17, 2018)
d08e4c1  add one more test on offsets (Aug 17, 2018)
468a000  get rid of subword_arrays in fasttext (Aug 17, 2018)
f71e1f8  make hanging indents everywhere (Aug 17, 2018)
811388b  open file in byte mode (Aug 18, 2018)
ddd5901  fix pep (Aug 18, 2018)
a3490c7  fix tests (Aug 18, 2018)
a28ff0d  fix again (Aug 18, 2018)
b2996f0  final fix? (Aug 18, 2018)
64bb617  regenerated .c & .cpp files (Aug 18, 2018)
816f63f  fix test_persistence_fromfile for FastText (Aug 18, 2018)
abad1b8  add fasttext & doc2vec to notebook (Aug 20, 2018)
0b03839  add short examples (Aug 20, 2018)
6217c73  update file-based tutorial notebook (piskvorky, Aug 23, 2018)
f70d159  work credit + minor nb fixes (piskvorky, Aug 25, 2018)
9593d5f  remove FIXMEs from file-based *2vec notebook (piskvorky, Sep 9, 2018)
7b714b2  remove warnings in corpus_file mode (persiyanov, Sep 9, 2018)
b833f0f  fix deprecation warning (menshikh-iv, Sep 12, 2018)
bcc0fb9  regenerate .ipynb (persiyanov, Sep 14, 2018)
384e0b1  upd plot (persiyanov, Sep 14, 2018)
527266f  upd plot (persiyanov, Sep 14, 2018)
gensim/models/base_any2vec.py: 72 changes (57 additions, 15 deletions)

@@ -43,7 +43,6 @@
 from types import GeneratorType
 from gensim.utils import deprecated
 import warnings
-import itertools

 try:
     from queue import Queue

@@ -123,6 +122,9 @@ def _clear_post_train(self):
"""Resets certain properties of the model post training. eg. `keyedvectors.vectors_norm`."""
raise NotImplementedError()

def _do_train_epoch(self, input_stream, thread_private_mem, cur_epoch, total_examples=None, total_words=None):
raise NotImplementedError()

def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
"""Train a single batch. Return 2-tuple `(effective word count, total word count)`."""
raise NotImplementedError()
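(The new `_do_train_epoch` hook is the stream-based counterpart of `_do_train_job`: rather than consuming queued batches, each concrete model overrides it to train a full epoch on one input stream; the Word2Vec override in the last file of this diff dispatches to the Cython `train_epoch_sg`/`train_epoch_cbow` routines.)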
@@ -136,6 +138,16 @@ def _check_input_data_sanity(self, data_iterable=None, data_iterables=None):
         if not ((data_iterable is not None) ^ (data_iterables is not None)):
             raise ValueError("You must provide only one of singlestream or multistream arguments.")
+    def _worker_loop_multistream(self, input_stream, progress_queue, cur_epoch=0,
+                                 total_examples=None, total_words=None):
+        thread_private_mem = self._get_thread_working_mem()
+
+        examples, tally, raw_tally = self._do_train_epoch(input_stream, thread_private_mem, cur_epoch,
+                                                          total_examples=total_examples, total_words=total_words)

> Owner: Hanging indent please (here and everywhere else).

+
+        progress_queue.put((examples, tally, raw_tally))
+        progress_queue.put(None)

     def _worker_loop(self, job_queue, progress_queue):
         """Train the model, lifting batches of data from the queue.

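Each multistream worker trains one full epoch on its own stream, reports a single (examples, effective_words, raw_words) tuple, and then posts a None sentinel so the progress logger knows that worker has finished. A standalone sketch of that pattern (all names here are illustrative, with dummy counting in place of the Cython training routines):

import threading
from queue import Queue

def do_train_epoch(stream):
    # stand-in for the Cython train_epoch_sg / train_epoch_cbow routines:
    # count sentences and words instead of actually training
    sentences = list(stream)
    words = sum(len(s) for s in sentences)
    return len(sentences), words, words  # examples, effective words, raw words

def worker_loop(stream, progress_queue):
    examples, tally, raw_tally = do_train_epoch(stream)
    progress_queue.put((examples, tally, raw_tally))
    progress_queue.put(None)  # sentinel: this worker has finished its epoch

streams = [
    [["first", "sentence"]],
    [["second", "sentence"], ["third", "one"]],
]
progress = Queue()
workers = [threading.Thread(target=worker_loop, args=(s, progress)) for s in streams]
for t in workers:
    t.daemon = True
    t.start()

# consume reports until every worker has sent its sentinel,
# mirroring what _log_epoch_progress does without a job_queue
unfinished = len(workers)
while unfinished > 0:
    report = progress.get()
    if report is None:
        unfinished -= 1
    else:
        print("examples=%d, effective=%d, raw=%d" % report)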
@@ -258,8 +270,8 @@ def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_coun
     def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally):
         raise NotImplementedError()

-    def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_examples=None, total_words=None,
-                            report_delay=1.0):
+    def _log_epoch_progress(self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None,
+                            total_words=None, report_delay=1.0):

> Contributor: why?
> persiyanov (Author): because in multistream mode there is no job_queue, so I made these arguments optional.

         """Get the progress report for a single training epoch.

         Parameters

@@ -328,8 +340,32 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam
         self.total_train_time += elapsed
         return trained_word_count, raw_word_count, job_tally
-    def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, total_examples=None,
-                     total_words=None, queue_factor=2, report_delay=1.0):
+    def _train_epoch_multistream(self, data_iterables, cur_epoch=0, total_examples=None, total_words=None):
+        assert len(data_iterables) == self.workers, "You have to pass the same amount of input streams as workers, " \
+            "because each worker gets its own independent input stream."

> piskvorky (Owner), Jul 14, 2018: assert is for checking programmer errors (code invariants), not user input. Exception better.

+
+        progress_queue = Queue()
+
+        workers = [
+            threading.Thread(
+                target=self._worker_loop_multistream,
+                args=(input_stream, progress_queue,),
+                kwargs={'cur_epoch': cur_epoch, 'total_examples': total_examples, 'total_words': total_words}
+            ) for input_stream in data_iterables
+        ]
+
+        for thread in workers:
+            thread.daemon = True
+            thread.start()
+
+        trained_word_count, raw_word_count, job_tally = self._log_epoch_progress(
+            progress_queue=progress_queue, job_queue=None, cur_epoch=cur_epoch, total_examples=total_examples,
+            total_words=total_words)
+
+        return trained_word_count, raw_word_count, job_tally
+
+    def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, total_words=None,
+                     queue_factor=2, report_delay=1.0):
         """Train the model for a single epoch.

         Parameters
@@ -361,7 +397,6 @@ def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, tot
             * Total word count used in training.

         """
-        self._check_input_data_sanity(data_iterable, data_iterables)
         job_queue = Queue(maxsize=queue_factor * self.workers)
         progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

@@ -372,9 +407,6 @@ def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, tot
             for _ in xrange(self.workers)
         ]

-        # Chain all input streams into one, because multistream training is not supported yet.
-        if data_iterables is not None:
-            data_iterable = itertools.chain(*data_iterables)
         workers.append(threading.Thread(
             target=self._job_producer,
             args=(data_iterable, job_queue),

@@ -444,10 +476,14 @@ def train(self, data_iterable=None, data_iterables=None, epochs=None, total_exam
         for callback in self.callbacks:
             callback.on_epoch_begin(self)

-        trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch(
-            data_iterable=data_iterable, data_iterables=data_iterables, cur_epoch=cur_epoch,
-            total_examples=total_examples, total_words=total_words, queue_factor=queue_factor,
-            report_delay=report_delay)
+        if data_iterable is not None:
+            trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch(
+                data_iterable, cur_epoch=cur_epoch, total_examples=total_examples,
+                total_words=total_words, queue_factor=queue_factor, report_delay=report_delay)
+        else:
+            trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch_multistream(
+                data_iterables, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words)
+
         trained_word_count += trained_word_count_epoch
         raw_word_count += raw_word_count_epoch
         job_tally += job_tally_epoch
@@ -550,6 +586,9 @@ def __init__(self, sentences=None, input_streams=None, workers=3, vector_size=10
             consider an iterable that streams the sentences directly from disk/network.
             See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
             or :class:`~gensim.models.word2vec.LineSentence` for such examples.
+        input_streams : list or tuple of iterable of iterables
+            The tuple or list of `sentences`-like arguments. Use it if you have multiple input streams. It is possible
+            to process streams in parallel, using `workers` parameter.
         workers : int, optional
             Number of working threads, used for multiprocessing.
         vector_size : int, optional
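At this stage of the PR, multistream training is driven by the input_streams parameter (later commits replace it with corpus_file, see commit 9ab6b1b). A minimal usage sketch, assuming the corpus has been pre-split into one LineSentence-format file per worker (the file names are hypothetical):

from gensim.models.word2vec import Word2Vec, LineSentence

# one stream per worker; _train_epoch_multistream above enforces len(input_streams) == workers
streams = [LineSentence('corpus_part1.txt'), LineSentence('corpus_part2.txt'), LineSentence('corpus_part3.txt')]
model = Word2Vec(input_streams=streams, workers=3, size=100, min_count=5)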
@@ -928,6 +967,9 @@ def train(self, sentences=None, input_streams=None, total_examples=None, total_w
             consider an iterable that streams the sentences directly from disk/network.
             See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
             or :class:`~gensim.models.word2vec.LineSentence` module for such examples.
+        input_streams : list or tuple of iterable of iterables
+            The tuple or list of `sentences`-like arguments. Use it if you have multiple input streams. It is possible
+            to process streams in parallel, using `workers` parameter.
         total_examples : int, optional
             Count of sentences.
         total_words : int, optional

@@ -1181,14 +1223,14 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot
             logger.info(
                 "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
                 cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed,
-                utils.qsize(job_queue), utils.qsize(progress_queue)
+                None if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue)
             )
         else:
             # words-based progress %
             logger.info(
                 "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
                 cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed,
-                utils.qsize(job_queue), utils.qsize(progress_queue)
+                None if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue)
             )

     def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
gensim/models/fast_line_sentence.cpp: 25 changes (25 additions, 0 deletions)

@@ -0,0 +1,25 @@
#include <stdexcept>

> Contributor: why is this needed?
> persiyanov (Author): removed

#include "fast_line_sentence.h"

> Contributor: what about gluing the .cpp and .h into one file?
> persiyanov (Author): done


FastLineSentence::FastLineSentence() : is_eof_(false) { }
FastLineSentence::FastLineSentence(const std::string& filename) : filename_(filename), fs_(filename), is_eof_(false) { }

std::vector<std::string> FastLineSentence::ReadSentence() {
    if (is_eof_) {
        return {};
    }
    std::string line, word;
    std::getline(fs_, line);
    std::vector<std::string> res;

    std::istringstream iss(line);
    while (iss >> word) {
        res.push_back(word);
    }

    if (fs_.eof()) {
        is_eof_ = true;
    }
    return res;
}
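For readers following along in Python terms, the C++ reader above is roughly equivalent to this sketch (a hypothetical class for illustration, not part of the PR): one whitespace-tokenized sentence per line, with an EOF flag so the caller knows when to stop or reset.

class PyLineSentence:
    """Hypothetical Python mirror of the C++ FastLineSentence, for illustration only."""

    def __init__(self, filename):
        self.filename = filename
        self.fs = open(filename, 'rb')
        self.is_eof = False

    def read_sentence(self):
        if self.is_eof:
            return []
        line = self.fs.readline()
        if not line:  # nothing left to read; the C++ version detects this via fs_.eof()
            self.is_eof = True
        return line.split()  # whitespace tokenization, like `iss >> word`

    def reset(self):
        # mirrors Reset(): reopen the file and clear the EOF flag
        self.fs = open(self.filename, 'rb')
        self.is_eof = False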
gensim/models/fast_line_sentence.h: 21 changes (21 additions, 0 deletions)

@@ -0,0 +1,21 @@
#pragma once

#include <fstream>
#include <sstream>
#include <vector>


class FastLineSentence {
public:
    explicit FastLineSentence();
    explicit FastLineSentence(const std::string& filename);

    std::vector<std::string> ReadSentence();
    inline bool IsEof() const { return is_eof_; }
    inline void Reset() { fs_ = std::ifstream(filename_); is_eof_ = false; }

private:
    std::string filename_;
    std::ifstream fs_;
    bool is_eof_;
};
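A side note on Reset() above: it move-assigns a fresh std::ifstream into fs_, which relies on C++11 stream move semantics. That is presumably why the PR adds C++11 compiler and linker flags (commits c09035c and d6755be).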
gensim/models/word2vec.py: 13 changes (13 additions, 0 deletions)

@@ -142,6 +142,7 @@

 try:
     from gensim.models.word2vec_inner import train_batch_sg, train_batch_cbow
+    from gensim.models.word2vec_inner import train_epoch_sg, train_epoch_cbow
     from gensim.models.word2vec_inner import score_sentence_sg, score_sentence_cbow
     from gensim.models.word2vec_inner import FAST_VERSION, MAX_WORDS_IN_BATCH

@@ -752,6 +753,18 @@ def __init__(self, sentences=None, input_streams=None, size=100, alpha=0.025, wi
             seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss,
             fast_version=FAST_VERSION)

+    def _do_train_epoch(self, input_stream, thread_private_mem, cur_epoch, total_examples=None, total_words=None):
+        work, neu1 = thread_private_mem
+
+        if self.sg:
+            examples, tally, raw_tally = train_epoch_sg(self, input_stream, cur_epoch, total_examples, total_words,
+                                                        work, neu1, self.compute_loss)
+        else:
+            examples, tally, raw_tally = train_epoch_cbow(self, input_stream, cur_epoch, total_examples, total_words,
+                                                          work, neu1, self.compute_loss)
+
+        return examples, tally, raw_tally
+
     def _do_train_job(self, sentences, alpha, inits):
         """Train the model on a single batch of sentences.

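After the input_streams API was replaced by corpus_file (commit 9ab6b1b), the merged feature is used by passing a path to a corpus saved in LineSentence format. A short sketch against the final API, using gensim's bundled toy corpus and the save_as_line_sentence helper added in this PR:

from gensim.models.word2vec import Word2Vec
from gensim.test.utils import common_texts
from gensim.utils import save_as_line_sentence

# write the corpus as one whitespace-separated sentence per line
save_as_line_sentence(common_texts, 'corpus.txt')

# corpus_file mode: workers read the file in parallel instead of sharing a job queue
model = Word2Vec(corpus_file='corpus.txt', workers=4, size=100, min_count=1)
print(model.wv.most_similar('computer'))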