From 5586c661706d7dbbe9247f91c3adc8573e805ee6 Mon Sep 17 00:00:00 2001
From: Michal Lopuszynski
Date: Fri, 8 Nov 2019 12:10:02 +0100
Subject: [PATCH 1/6] Speed up word2vec binary model loading (#2642)

---
 gensim/models/utils_any2vec.py | 107 +++++++++++++++++++++------------
 1 file changed, 67 insertions(+), 40 deletions(-)

diff --git a/gensim/models/utils_any2vec.py b/gensim/models/utils_any2vec.py
index 4f5396c853..2ef977fdb5 100644
--- a/gensim/models/utils_any2vec.py
+++ b/gensim/models/utils_any2vec.py
@@ -28,7 +28,7 @@
 import logging
 
 from gensim import utils
-from numpy import zeros, dtype, float32 as REAL, ascontiguousarray, fromstring
+from numpy import zeros, dtype, float32 as REAL, ascontiguousarray, frombuffer
 from six.moves import range
 from six import iteritems, PY2
 
@@ -147,7 +147,7 @@ def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, tota
 
 def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
-                          limit=None, datatype=REAL):
+                          limit=None, datatype=REAL, binary_chunk_size=100 * 1024):
     """Load the input-hidden weight matrix from the original C word2vec-tool format.
 
     Note that the information stored in the file is incomplete (the binary tree is missing),
@@ -176,6 +176,8 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8'
     datatype : type, optional
         (Experimental) Can coerce dimensions to a non-default float type (such as `np.float16`) to save memory.
         Such types may result in much slower bulk operations or incompatibility with optimized routines.)
+    binary_chunk_size : int, optional
+        Size of chunk in which binary files are read. Used mostly for testing. Default value 100 kB.
 
     Returns
     -------
     object
         Returns the loaded model as an instance of :class:`cls`.
 
     """
+
+    def __add_word_to_result(result, counts, word, weights):
+        word_id = len(result.vocab)
+        if word in result.vocab:
+            logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
+            return
+        if counts is None:
+            # most common scenario: no vocab file given. just make up some bogus counts, in descending order
+            result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
+        elif word in counts:
+            # use count from the vocab file
+            result.vocab[word] = Vocab(index=word_id, count=counts[word])
+        else:
+            # vocab file given, but word is missing -- set count to None (TODO: or raise?)
+            logger.warning("vocabulary file is incomplete: '%s' is missing", word)
+            result.vocab[word] = Vocab(index=word_id, count=None)
+        result.vectors[word_id] = weights
+        result.index2word.append(word)
+
+    def __remove_initial_new_line(s):
+        i = 0
+        while i < len(s) and s[i] == '\n':
+            i += 1
+        return s[i:]
+
+    def __add_words_from_binary_chunk_to_result(result, counts, max_words, chunk, vector_size, datatype):
+        start = 0
+        n = len(chunk)
+        processed_words = 0
+        n_bytes_per_vector = vector_size * dtype(REAL).itemsize
+
+        for _ in range(0, max_words):
+            i_space = chunk.find(b' ', start)
+            i_vector = i_space + 1
+            if i_space != -1 and (n - i_vector) >= n_bytes_per_vector:
+                word = chunk[start:i_space].decode("utf-8", errors=unicode_errors)
+                # Some binary files are reported to have an obsolete newline at the beginning of the word; remove it
+                word = __remove_initial_new_line(word)
+                vector = frombuffer(chunk, offset=i_vector, count=vector_size, dtype=REAL).astype(datatype)
+                __add_word_to_result(result, counts, word, vector)
+                start = i_vector + n_bytes_per_vector
+                processed_words += 1
+            else:
+                break
+
+        return processed_words, chunk[start:]
+
     from gensim.models.keyedvectors import Vocab
+
     counts = None
     if fvocab is not None:
         logger.info("loading word counts from %s", fvocab)
@@ -192,7 +242,6 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8'
         for line in fin:
             word, count = utils.to_unicode(line, errors=unicode_errors).strip().split()
             counts[word] = int(count)
-
     logger.info("loading projection weights from %s", fname)
     with utils.open(fname, 'rb') as fin:
         header = utils.to_unicode(fin.readline(), encoding=encoding)
@@ -202,43 +251,21 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8'
         result = cls(vector_size)
         result.vector_size = vector_size
         result.vectors = zeros((vocab_size, vector_size), dtype=datatype)
-
-        def add_word(word, weights):
-            word_id = len(result.vocab)
-            if word in result.vocab:
-                logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
-                return
-            if counts is None:
-                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
-                result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
-            elif word in counts:
-                # use count from the vocab file
-                result.vocab[word] = Vocab(index=word_id, count=counts[word])
-            else:
-                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
-                logger.warning("vocabulary file is incomplete: '%s' is missing", word)
-                result.vocab[word] = Vocab(index=word_id, count=None)
-            result.vectors[word_id] = weights
-            result.index2word.append(word)
-
         if binary:
-            binary_len = dtype(REAL).itemsize * vector_size
-            for _ in range(vocab_size):
-                # mixed text and binary: read text first, then binary
-                word = []
-                while True:
-                    ch = fin.read(1)  # Python uses I/O buffering internally
-                    if ch == b' ':
-                        break
-                    if ch == b'':
-                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
-                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
-                        word.append(ch)
-                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
-                with utils.ignore_deprecation_warning():
-                    # TODO use frombuffer or something similar
-                    weights = fromstring(fin.read(binary_len), dtype=REAL).astype(datatype)
-                add_word(word, weights)
+            chunk = b''
+            tot_processed_words = 0
+
+            while tot_processed_words < vocab_size:
+                new_chunk = fin.read(binary_chunk_size)
+                chunk += new_chunk
+                max_words = vocab_size - len(result.vocab)
+                processed_words, chunk = __add_words_from_binary_chunk_to_result(result, counts, max_words,
+                                                                                 chunk, vector_size, datatype)
+                tot_processed_words += processed_words
+                if len(new_chunk) < binary_chunk_size:
+                    break
+            if tot_processed_words != vocab_size:
+                raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
         else:
             for line_no in range(vocab_size):
                 line = fin.readline()
@@ -248,7 +275,7 @@ def add_word(word, weights):
                 if len(parts) != vector_size + 1:
                     raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                 word, weights = parts[0], [datatype(x) for x in parts[1:]]
-                add_word(word, weights)
+                __add_word_to_result(result, counts, word, weights)
     if result.vectors.shape[0] != len(result.vocab):
         logger.info(
             "duplicate words detected, shrinking matrix size from %i to %i",

From 03ee3d0c4e198a66637a8ccfd08aeeed4fbb83ab Mon Sep 17 00:00:00 2001
From: Michal Lopuszynski
Date: Fri, 8 Nov 2019 12:10:29 +0100
Subject: [PATCH 2/6] Add correctness tests for optimized word2vec model
 loading (#2642)

---
 gensim/test/test_utils_any2vec.py | 122 ++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 gensim/test/test_utils_any2vec.py

diff --git a/gensim/test/test_utils_any2vec.py b/gensim/test/test_utils_any2vec.py
new file mode 100644
index 0000000000..f4c5c2c430
--- /dev/null
+++ b/gensim/test/test_utils_any2vec.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2017 Radim Rehurek
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Automated tests for checking utils_any2vec functionality.
+""" + +import logging +import unittest + +import numpy as np + +import gensim.utils +import gensim.test.utils + +import gensim.models.utils_any2vec + + +logger = logging.getLogger(__name__) + + +def save_dict_to_word2vec_formated_file(fname, word2vec_dict): + + with gensim.utils.open(fname, "bw") as f: + + num_words = len(word2vec_dict) + vector_length = len(list(word2vec_dict.values())[0]) + + header = "%d %d\n" % (num_words, vector_length) + f.write(header.encode(encoding="ascii")) + + for word, vector in word2vec_dict.items(): + f.write(word.encode()) + f.write(' '.encode()) + f.write(np.array(vector).astype(np.float32).tobytes()) + + +class LoadWord2VecFormatTest(unittest.TestCase): + + def assert_dict_equal_to_model(self, d, m): + self.assertEqual(len(d), len(m.vocab)) + + for word in d.keys(): + self.assertSequenceEqual(list(d[word]), list(m[word])) + + def verify_load2vec_binary_result(self, w2v_dict, binary_chunk_size, limit): + tmpfile = gensim.test.utils.get_tmpfile("tmp_w2v") + save_dict_to_word2vec_formated_file(tmpfile, w2v_dict) + w2v_model = \ + gensim.models.utils_any2vec._load_word2vec_format( + cls=gensim.models.KeyedVectors, + fname=tmpfile, + binary=True, + limit=limit, + binary_chunk_size=binary_chunk_size) + if limit is None: + limit = len(w2v_dict) + + w2v_keys_postprocessed = list(w2v_dict.keys())[:limit] + w2v_dict_postprocessed = {k.lstrip(): w2v_dict[k] for k in w2v_keys_postprocessed} + + self.assert_dict_equal_to_model(w2v_dict_postprocessed, w2v_model) + + def test_load_word2vec_format_basic(self): + w2v_dict = {"abc": [1, 2, 3], + "cde": [4, 5, 6], + "def": [7, 8, 9]} + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=None) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=None) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=None) + + w2v_dict = {"abc": [1, 2, 3], + "cdefg": [4, 5, 6], + "d": [7, 8, 9]} + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=None) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=None) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=None) + + def test_load_word2vec_format_limit(self): + w2v_dict = {"abc": [1, 2, 3], + "cde": [4, 5, 6], + "def": [7, 8, 9]} + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=1) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=1) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=1) + + w2v_dict = {"abc": [1, 2, 3], + "cde": [4, 5, 6], + "def": [7, 8, 9]} + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=2) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=2) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=2) + + w2v_dict = {"abc": [1, 2, 3], + "cdefg": [4, 5, 6], + "d": [7, 8, 9]} + + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=1) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=1) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=1) + + w2v_dict = {"abc": [1, 2, 3], + "cdefg": [4, 5, 6], + "d": [7, 8, 9]} + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=2) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=16, limit=2) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=1024, limit=2) + + def test_load_word2vec_format_space_stripping(self): + w2v_dict = {"\nabc": 
[1, 2, 3], + "cdefdg": [4, 5, 6], + "\n\ndef": [7, 8, 9]} + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=None) + self.verify_load2vec_binary_result(w2v_dict, binary_chunk_size=5, limit=1) + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) + unittest.main() From 90c10892ffd3b26f9588ddfeef3a781a2dc0dbf4 Mon Sep 17 00:00:00 2001 From: Michal Lopuszynski Date: Thu, 14 Nov 2019 19:42:24 +0100 Subject: [PATCH 3/6] Include remarks of Radim to code speeding up vectors loading (#2671) --- gensim/models/utils_any2vec.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/gensim/models/utils_any2vec.py b/gensim/models/utils_any2vec.py index 2ef977fdb5..0162908fc8 100644 --- a/gensim/models/utils_any2vec.py +++ b/gensim/models/utils_any2vec.py @@ -177,7 +177,7 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8' (Experimental) Can coerce dimensions to a non-default float type (such as `np.float16`) to save memory. Such types may result in much slower bulk operations or incompatibility with optimized routines.) binary_chunk_size : int, optional - Size of chunk in which binary files are read. Used mostly for testing. Defalut value 100 kB. + Read input file in chunks of this many bytes for performance reasons. Returns ------- @@ -204,12 +204,6 @@ def __add_word_to_result(result, counts, word, weights): result.vectors[word_id] = weights result.index2word.append(word) - def __remove_initial_new_line(s): - i = 0 - while i < len(s) and s[i] == '\n': - i += 1 - return s[i:] - def __add_words_from_binary_chunk_to_result(result, counts, max_words, chunk, vector_size, datatype): start = 0 n = len(chunk) @@ -222,7 +216,7 @@ def __add_words_from_binary_chunk_to_result(result, counts, max_words, chunk, ve if i_space != -1 and (n - i_vector) >= n_bytes_per_vector: word = chunk[start:i_space].decode("utf-8", errors=unicode_errors) # Some binary files are reported to have obsolete new line in the beginning of word, remove it - word = __remove_initial_new_line(word) + word = word.lstrip('\n') vector = frombuffer(chunk, offset=i_vector, count=vector_size, dtype=REAL).astype(datatype) __add_word_to_result(result, counts, word, vector) start = i_vector + n_bytes_per_vector @@ -242,6 +236,7 @@ def __add_words_from_binary_chunk_to_result(result, counts, max_words, chunk, ve for line in fin: word, count = utils.to_unicode(line, errors=unicode_errors).strip().split() counts[word] = int(count) + logger.info("loading projection weights from %s", fname) with utils.open(fname, 'rb') as fin: header = utils.to_unicode(fin.readline(), encoding=encoding) @@ -259,8 +254,8 @@ def __add_words_from_binary_chunk_to_result(result, counts, max_words, chunk, ve new_chunk = fin.read(binary_chunk_size) chunk += new_chunk max_words = vocab_size - len(result.vocab) - processed_words, chunk = __add_words_from_binary_chunk_to_result(result, counts, max_words, - chunk, vector_size, datatype) + processed_words, chunk = __add_words_from_binary_chunk_to_result( + result, counts, max_words, chunk, vector_size, datatype) tot_processed_words += processed_words if len(new_chunk) < binary_chunk_size: break From b5930a9edfd9537f5e52d2b697a821bc8007c611 Mon Sep 17 00:00:00 2001 From: Michal Lopuszynski Date: Thu, 14 Nov 2019 21:27:55 +0100 Subject: [PATCH 4/6] Include remarks of Michael to code speeding up vectors loading (#2671) --- gensim/models/utils_any2vec.py | 43 
+++++++++++++++++----------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/gensim/models/utils_any2vec.py b/gensim/models/utils_any2vec.py index 0162908fc8..0be4131bdb 100644 --- a/gensim/models/utils_any2vec.py +++ b/gensim/models/utils_any2vec.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # Author: Shiva Manne -# Copyright (C) 2018 RaRe Technologies s.r.o. +# Copyright (C) 2019 RaRe Technologies s.r.o. """General functions used for any2vec models. @@ -186,7 +186,8 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8' """ - def __add_word_to_result(result, counts, word, weights): + def _add_word_to_result(result, counts, word, weights, vocab_size): + from gensim.models.keyedvectors import Vocab word_id = len(result.vocab) if word in result.vocab: logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname) @@ -204,29 +205,27 @@ def __add_word_to_result(result, counts, word, weights): result.vectors[word_id] = weights result.index2word.append(word) - def __add_words_from_binary_chunk_to_result(result, counts, max_words, chunk, vector_size, datatype): + def _add_bytes_to_result(result, counts, chunk, vocab_size, vector_size, datatype, unicode_errors): start = 0 - n = len(chunk) processed_words = 0 - n_bytes_per_vector = vector_size * dtype(REAL).itemsize - - for _ in range(0, max_words): + bytes_per_vector = vector_size * dtype(REAL).itemsize + max_words = vocab_size - len(result.vocab) + for _ in range(max_words): i_space = chunk.find(b' ', start) i_vector = i_space + 1 - if i_space != -1 and (n - i_vector) >= n_bytes_per_vector: - word = chunk[start:i_space].decode("utf-8", errors=unicode_errors) - # Some binary files are reported to have obsolete new line in the beginning of word, remove it - word = word.lstrip('\n') - vector = frombuffer(chunk, offset=i_vector, count=vector_size, dtype=REAL).astype(datatype) - __add_word_to_result(result, counts, word, vector) - start = i_vector + n_bytes_per_vector - processed_words += 1 - else: + + if i_space == -1 or (len(chunk) - i_vector) < bytes_per_vector: break - return processed_words, chunk[start:] + word = chunk[start:i_space].decode("utf-8", errors=unicode_errors) + # Some binary files are reported to have obsolete new line in the beginning of word, remove it + word = word.lstrip('\n') + vector = frombuffer(chunk, offset=i_vector, count=vector_size, dtype=REAL).astype(datatype) + _add_word_to_result(result, counts, word, vector, vocab_size) + start = i_vector + bytes_per_vector + processed_words += 1 - from gensim.models.keyedvectors import Vocab + return processed_words, chunk[start:] counts = None if fvocab is not None: @@ -246,6 +245,7 @@ def __add_words_from_binary_chunk_to_result(result, counts, max_words, chunk, ve result = cls(vector_size) result.vector_size = vector_size result.vectors = zeros((vocab_size, vector_size), dtype=datatype) + if binary: chunk = b'' tot_processed_words = 0 @@ -253,9 +253,8 @@ def __add_words_from_binary_chunk_to_result(result, counts, max_words, chunk, ve while tot_processed_words < vocab_size: new_chunk = fin.read(binary_chunk_size) chunk += new_chunk - max_words = vocab_size - len(result.vocab) - processed_words, chunk = __add_words_from_binary_chunk_to_result( - result, counts, max_words, chunk, vector_size, datatype) + processed_words, chunk = _add_bytes_to_result( + result, counts, chunk, vocab_size, vector_size, datatype, unicode_errors) tot_processed_words += processed_words if len(new_chunk) < binary_chunk_size: break @@ -270,7 
+269,7 @@ def __add_words_from_binary_chunk_to_result(result, counts, max_words, chunk, ve if len(parts) != vector_size + 1: raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) word, weights = parts[0], [datatype(x) for x in parts[1:]] - __add_word_to_result(result, counts, word, weights) + _add_word_to_result(result, counts, word, weights, vocab_size) if result.vectors.shape[0] != len(result.vocab): logger.info( "duplicate words detected, shrinking matrix size from %i to %i", From c6e3fb2702c0b3152d238f00d95bfc1076bf84ef Mon Sep 17 00:00:00 2001 From: Michal Lopuszynski Date: Thu, 14 Nov 2019 21:50:16 +0100 Subject: [PATCH 5/6] Refactor _load_word2vec_format into a few functions for better readability --- gensim/models/utils_any2vec.py | 139 ++++++++++++++++++--------------- 1 file changed, 76 insertions(+), 63 deletions(-) diff --git a/gensim/models/utils_any2vec.py b/gensim/models/utils_any2vec.py index 0be4131bdb..3dda5e6617 100644 --- a/gensim/models/utils_any2vec.py +++ b/gensim/models/utils_any2vec.py @@ -146,6 +146,79 @@ def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, tota fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row)))) +# Functions for internal use by _load_word2vec_format function + +def _add_word_to_result(result, counts, word, weights, vocab_size): + from gensim.models.keyedvectors import Vocab + word_id = len(result.vocab) + if word in result.vocab: + logger.warning("duplicate word '%s' in word2vec file, ignoring all but first", word) + return + if counts is None: + # most common scenario: no vocab file given. just make up some bogus counts, in descending order + result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id) + elif word in counts: + # use count from the vocab file + result.vocab[word] = Vocab(index=word_id, count=counts[word]) + else: + # vocab file given, but word is missing -- set count to None (TODO: or raise?) 
+        logger.warning("vocabulary file is incomplete: '%s' is missing", word)
+        result.vocab[word] = Vocab(index=word_id, count=None)
+    result.vectors[word_id] = weights
+    result.index2word.append(word)
+
+
+def _add_bytes_to_result(result, counts, chunk, vocab_size, vector_size, datatype, unicode_errors):
+    start = 0
+    processed_words = 0
+    bytes_per_vector = vector_size * dtype(REAL).itemsize
+    max_words = vocab_size - len(result.vocab)
+    for _ in range(max_words):
+        i_space = chunk.find(b' ', start)
+        i_vector = i_space + 1
+
+        if i_space == -1 or (len(chunk) - i_vector) < bytes_per_vector:
+            break
+
+        word = chunk[start:i_space].decode("utf-8", errors=unicode_errors)
+        # Some binary files are reported to have an obsolete newline at the beginning of the word; remove it
+        word = word.lstrip('\n')
+        vector = frombuffer(chunk, offset=i_vector, count=vector_size, dtype=REAL).astype(datatype)
+        _add_word_to_result(result, counts, word, vector, vocab_size)
+        start = i_vector + bytes_per_vector
+        processed_words += 1
+
+    return processed_words, chunk[start:]
+
+
+def _word2vec_read_binary(fin, result, counts, vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size):
+    chunk = b''
+    tot_processed_words = 0
+
+    while tot_processed_words < vocab_size:
+        new_chunk = fin.read(binary_chunk_size)
+        chunk += new_chunk
+        processed_words, chunk = _add_bytes_to_result(
+            result, counts, chunk, vocab_size, vector_size, datatype, unicode_errors)
+        tot_processed_words += processed_words
+        if len(new_chunk) < binary_chunk_size:
+            break
+    if tot_processed_words != vocab_size:
+        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
+
+
+def _word2vec_read_text(fin, result, counts, vocab_size, vector_size, datatype, unicode_errors, encoding):
+    for line_no in range(vocab_size):
+        line = fin.readline()
+        if line == b'':
+            raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
+        parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
+        if len(parts) != vector_size + 1:
+            raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
+        word, weights = parts[0], [datatype(x) for x in parts[1:]]
+        _add_word_to_result(result, counts, word, weights, vocab_size)
+
+
 def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
                           limit=None, datatype=REAL, binary_chunk_size=100 * 1024):
     """Load the input-hidden weight matrix from the original C word2vec-tool format.
@@ -186,47 +259,6 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8'
 
     """
 
-    def _add_word_to_result(result, counts, word, weights, vocab_size):
-        from gensim.models.keyedvectors import Vocab
-        word_id = len(result.vocab)
-        if word in result.vocab:
-            logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
-            return
-        if counts is None:
-            # most common scenario: no vocab file given. just make up some bogus counts, in descending order
-            result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
-        elif word in counts:
-            # use count from the vocab file
-            result.vocab[word] = Vocab(index=word_id, count=counts[word])
-        else:
-            # vocab file given, but word is missing -- set count to None (TODO: or raise?)
-            logger.warning("vocabulary file is incomplete: '%s' is missing", word)
-            result.vocab[word] = Vocab(index=word_id, count=None)
-        result.vectors[word_id] = weights
-        result.index2word.append(word)
-
-    def _add_bytes_to_result(result, counts, chunk, vocab_size, vector_size, datatype, unicode_errors):
-        start = 0
-        processed_words = 0
-        bytes_per_vector = vector_size * dtype(REAL).itemsize
-        max_words = vocab_size - len(result.vocab)
-        for _ in range(max_words):
-            i_space = chunk.find(b' ', start)
-            i_vector = i_space + 1
-
-            if i_space == -1 or (len(chunk) - i_vector) < bytes_per_vector:
-                break
-
-            word = chunk[start:i_space].decode("utf-8", errors=unicode_errors)
-            # Some binary files are reported to have an obsolete newline at the beginning of the word; remove it
-            word = word.lstrip('\n')
-            vector = frombuffer(chunk, offset=i_vector, count=vector_size, dtype=REAL).astype(datatype)
-            _add_word_to_result(result, counts, word, vector, vocab_size)
-            start = i_vector + bytes_per_vector
-            processed_words += 1
-
-        return processed_words, chunk[start:]
-
     counts = None
     if fvocab is not None:
         logger.info("loading word counts from %s", fvocab)
@@ -247,29 +279,10 @@ def _add_bytes_to_result(result, counts, chunk, vocab_size, vector_size, datatyp
         result.vectors = zeros((vocab_size, vector_size), dtype=datatype)
 
         if binary:
-            chunk = b''
-            tot_processed_words = 0
-
-            while tot_processed_words < vocab_size:
-                new_chunk = fin.read(binary_chunk_size)
-                chunk += new_chunk
-                processed_words, chunk = _add_bytes_to_result(
-                    result, counts, chunk, vocab_size, vector_size, datatype, unicode_errors)
-                tot_processed_words += processed_words
-                if len(new_chunk) < binary_chunk_size:
-                    break
-            if tot_processed_words != vocab_size:
-                raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
+            _word2vec_read_binary(fin, result, counts,
+                                  vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size)
         else:
-            for line_no in range(vocab_size):
-                line = fin.readline()
-                if line == b'':
-                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
-                parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
-                if len(parts) != vector_size + 1:
-                    raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
-                word, weights = parts[0], [datatype(x) for x in parts[1:]]
-                _add_word_to_result(result, counts, word, weights, vocab_size)
+            _word2vec_read_text(fin, result, counts, vocab_size, vector_size, datatype, unicode_errors, encoding)
     if result.vectors.shape[0] != len(result.vocab):
         logger.info(
             "duplicate words detected, shrinking matrix size from %i to %i",

From 1f230a20925e877a7d0bef5dafe8cbb3b99b39dd Mon Sep 17 00:00:00 2001
From: Michal Lopuszynski
Date: Thu, 14 Nov 2019 22:17:25 +0100
Subject: [PATCH 6/6] Clean up _add_word_to_result function

---
 gensim/models/utils_any2vec.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/gensim/models/utils_any2vec.py b/gensim/models/utils_any2vec.py
index 3dda5e6617..563f26b8f5 100644
--- a/gensim/models/utils_any2vec.py
+++ b/gensim/models/utils_any2vec.py
@@ -148,6 +148,7 @@ def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, tota
 
 # Functions for internal use by _load_word2vec_format function
 
+
 def _add_word_to_result(result, counts, word, weights, vocab_size):
     from gensim.models.keyedvectors import Vocab
     word_id = len(result.vocab)
@@ -156,14 +157,15 @@ def _add_word_to_result(result, counts, word, weights, vocab_size):
         return
     if counts is None:
         # most common scenario: no vocab file given. just make up some bogus counts, in descending order
-        result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
+        word_count = vocab_size - word_id
     elif word in counts:
         # use count from the vocab file
-        result.vocab[word] = Vocab(index=word_id, count=counts[word])
+        word_count = counts[word]
     else:
-        # vocab file given, but word is missing -- set count to None (TODO: or raise?)
         logger.warning("vocabulary file is incomplete: '%s' is missing", word)
-        result.vocab[word] = Vocab(index=word_id, count=None)
+        word_count = None
+
+    result.vocab[word] = Vocab(index=word_id, count=word_count)
     result.vectors[word_id] = weights
     result.index2word.append(word)
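
A minimal standalone sketch of the chunk-parsing idea the patches above implement, runnable without the gensim internals (the helper name parse_records and the sample values below are illustrative only, not part of the gensim API):

    import numpy as np

    def parse_records(chunk, vector_size, max_words):
        """Extract complete (word, vector) records from a bytes buffer.

        Returns the parsed records and the unparsed tail; the caller
        prepends the tail to the next chunk read from the file.
        """
        bytes_per_vector = vector_size * np.dtype(np.float32).itemsize
        records, start = [], 0
        for _ in range(max_words):
            i_space = chunk.find(b' ', start)
            i_vector = i_space + 1
            # Stop once the word or its vector lies only partially inside the buffer.
            if i_space == -1 or len(chunk) - i_vector < bytes_per_vector:
                break
            word = chunk[start:i_space].decode('utf-8').lstrip('\n')
            vector = np.frombuffer(chunk, offset=i_vector, count=vector_size, dtype=np.float32)
            records.append((word, vector))
            start = i_vector + bytes_per_vector
        return records, chunk[start:]

For example, parse_records(b'abc ' + np.arange(4, dtype=np.float32).tobytes(), vector_size=4, max_words=10) yields one ('abc', vector) record and an empty tail. Reading the file in large chunks and slicing vectors out of the buffer with frombuffer replaces the original loader's one-byte fin.read(1) calls and per-vector fromstring copies, which is where the speedup in patch 1 comes from.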