Updated RandomState (deprecated in NumPy) to default_rng (Generator)
This addresses issue piskvorky#2782.

Here are benchmarks from before and after the update:

Test module      Before update               After update
Poincare         Ran 42 tests in 0.418s      Ran 42 tests in 0.417s
test_lda         Ran 48 tests in 223.845s    Ran 48 tests in 225.561s
utils            Ran 24 tests in 0.007s      Ran 24 tests in 0.007s
test_matutils    Ran 18 tests in 0.071s      Ran 18 tests in 0.070s
word2vec         Ran 79 tests in 58.149s     Ran 79 tests in 57.950s

I don't see a big difference in the time taken. However, I feel it is good to stay up to date with NumPy.
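For reference, the migration swaps legacy np.random.RandomState methods for their np.random.Generator counterparts. A minimal sketch of the correspondence used throughout this commit (variable names here are illustrative only):

import numpy as np

legacy = np.random.RandomState(42)   # legacy API
rng = np.random.default_rng(42)      # new Generator API

legacy.rand(3)                       # uniform floats in [0, 1)
rng.random(3)                        # Generator equivalent of rand()

legacy.randint(0, 10, size=5)        # integers in [0, 10)
rng.integers(0, 10, size=5)          # Generator equivalent of randint()

Note that the two APIs produce different streams even with the same seed, so runs seeded under RandomState are not reproduced bit-for-bit by default_rng.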
SagarDollin committed Aug 28, 2021
1 parent c0f384c commit 8443280
Showing 8 changed files with 45 additions and 17 deletions.
2 changes: 1 addition & 1 deletion gensim/models/ldamodel.py
@@ -1174,7 +1174,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
num_topics = min(num_topics, self.num_topics)

# add a little random jitter, to randomize results around the same alpha
- sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))
+ sort_alpha = self.alpha + 0.0001 * self.random_state.random(len(self.alpha))
# random_state.rand returns float64, but converting back to dtype won't speed up anything

sorted_topics = list(matutils.argsort(sort_alpha))
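One caveat on this hunk: Generator.integers(low=0, high=1) samples from the half-open range [0, 1) and always returns 0, which would zero out the jitter; the drop-in replacement for RandomState.rand(n) is Generator.random(n), as used above. A quick check (illustrative only):

import numpy as np

rng = np.random.default_rng(0)
print(rng.integers(low=0, high=1, size=5))  # [0 0 0 0 0] -- high is exclusive
print(rng.random(5))                        # uniform floats in [0, 1), like RandomState.rand(5)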
7 changes: 5 additions & 2 deletions gensim/models/poincare.py
@@ -164,7 +164,7 @@ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsil
self._burn_in_done = False
self.dtype = dtype
self.seed = seed
- self._np_random = np_random.RandomState(seed)
+ self._np_random = np_random.default_rng(seed)
self.init_range = init_range
self._loss_grad = None
self.build_vocab(train_data)
@@ -264,7 +264,10 @@ def _get_candidate_negatives(self):
# this is to avoid floating point errors that result when the number of nodes is very high
# for reference: https://github.com/RaRe-Technologies/gensim/issues/1917
max_cumsum_value = self._node_counts_cumsum[-1]
- uniform_numbers = self._np_random.randint(1, max_cumsum_value + 1, self._negatives_buffer_size)
+ if isinstance(self._np_random, np.random.Generator):
+     uniform_numbers = self._np_random.integers(1, max_cumsum_value + 1, self._negatives_buffer_size)
+ else:
+     uniform_numbers = self._np_random.randint(1, max_cumsum_value + 1, self._negatives_buffer_size)
cumsum_table_indices = np.searchsorted(self._node_counts_cumsum, uniform_numbers)
self._negatives_buffer = NegativesBuffer(cumsum_table_indices)
return self._negatives_buffer.get_items(self.negative)
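The same Generator-or-RandomState dispatch recurs in word2vec_inner.pyx below. A hypothetical helper that would centralize it (a sketch only, not part of this commit; draw_integers is an invented name):

import numpy as np

def draw_integers(rng, low, high, size):
    """Draw `size` integers in [low, high) from either random API."""
    if isinstance(rng, np.random.Generator):
        return rng.integers(low, high, size)
    return rng.randint(low, high, size)  # legacy RandomState path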
14 changes: 14 additions & 0 deletions gensim/models/test_poincare.py
@@ -0,0 +1,14 @@
from gensim.models.poincare import PoincareModel, PoincareRelations
from time import time
import numpy as np

# rough timing of PoincareModel training with the Generator-based RNG
t1 = time()
file_path = "C:\\Users\\sagar\\gensim\\gensim\\test\\test_data\\poincare_hypernyms_large.tsv"
model = PoincareModel(PoincareRelations(file_path), negative=2)
model.train(epochs=50)
t2 = time()
print(t2 - t1)

# inspect the docstrings of the legacy and new sampling methods
print(np.random.RandomState.rand.__doc__)
print(np.random.default_rng(1).gamma.__doc__)
2 changes: 1 addition & 1 deletion gensim/models/word2vec.py
@@ -384,7 +384,7 @@ def __init__(

self.window = int(window)
self.shrink_windows = bool(shrink_windows)
- self.random = np.random.RandomState(seed)
+ self.random = np.random.default_rng(seed)

self.hs = int(hs)
self.negative = int(negative)
21 changes: 16 additions & 5 deletions gensim/models/word2vec_inner.pyx
@@ -489,7 +489,10 @@ cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1
c[0].cum_table = <np.uint32_t *>(np.PyArray_DATA(model.cum_table))
c[0].cum_table_len = len(model.cum_table)
if c[0].negative or c[0].sample:
- c[0].next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)
+ if isinstance(model.random, np.random.Generator):
+     c[0].next_random = (2**24) * model.random.integers(0, 2**24) + model.random.integers(0, 2**24)
+ else:
+     c[0].next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)

# convert Python structures to primitive types, so we can release the GIL
c[0].work = <REAL_t *>np.PyArray_DATA(_work)
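The seeding line above packs two independent 24-bit draws into a single 48-bit value for the C-side PRNG state. A small illustration of the arithmetic (plain Python, illustrative only):

import numpy as np

rng = np.random.default_rng(0)
hi = int(rng.integers(0, 2**24))  # upper 24 bits
lo = int(rng.integers(0, 2**24))  # lower 24 bits
next_random = (2**24) * hi + lo   # uniformly distributed in [0, 2**48)
assert 0 <= next_random < 2**48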
@@ -567,8 +570,12 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss):

# precompute "reduced window" offsets in a single randint() call
  if model.shrink_windows:
-     for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
-         c.reduced_windows[i] = item
+     if isinstance(model.random, np.random.Generator):
+         for i, item in enumerate(model.random.integers(0, c.window, effective_words)):
+             c.reduced_windows[i] = item
+     else:
+         for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
+             c.reduced_windows[i] = item
  else:
      for i in range(effective_words):
          c.reduced_windows[i] = 0
@@ -667,8 +674,12 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss):

# precompute "reduced window" offsets in a single randint() call
  if model.shrink_windows:
-     for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
-         c.reduced_windows[i] = item
+     if isinstance(model.random, np.random.Generator):
+         for i, item in enumerate(model.random.integers(0, c.window, effective_words)):
+             c.reduced_windows[i] = item
+     else:
+         for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
+             c.reduced_windows[i] = item
  else:
      for i in range(effective_words):
          c.reduced_windows[i] = 0
8 changes: 4 additions & 4 deletions gensim/test/test_ldamodel.py
@@ -31,9 +31,9 @@


def test_random_state():
- testcases = [np.random.seed(0), None, np.random.RandomState(0), 0]
+ testcases = [np.random.seed(0), None, np.random.default_rng(0), 0]
  for testcase in testcases:
-     assert(isinstance(utils.get_random_state(testcase), np.random.RandomState))
+     assert(isinstance(utils.get_random_state(testcase), np.random.Generator))


class TestLdaModel(unittest.TestCase, basetmtests.TestBaseTopicModel):
@@ -51,8 +51,8 @@ def test_sync_state(self):
assert_allclose(self.model.get_topics(), model2.get_topics(), rtol=1e-5)

# properly continues training on the new state
- self.model.random_state = np.random.RandomState(0)
- model2.random_state = np.random.RandomState(0)
+ self.model.random_state = np.random.default_rng(0)
+ model2.random_state = np.random.default_rng(0)
self.model.passes = 1
model2.passes = 1
self.model.update(self.corpus)
2 changes: 1 addition & 1 deletion gensim/test/test_matutils.py
@@ -86,7 +86,7 @@ def dirichlet_expectation(alpha):

class TestLdaModelInner(unittest.TestCase):
def setUp(self):
- self.random_state = np.random.RandomState()
+ self.random_state = np.random.default_rng()
self.num_runs = 100 # test functions with *num_runs* random inputs
self.num_topics = 100

6 changes: 3 additions & 3 deletions gensim/utils.py
@@ -86,10 +86,10 @@ def get_random_state(seed):
"""
  if seed is None or seed is np.random:
-     return np.random.mtrand._rand
+     return np.random.default_rng()
  if isinstance(seed, (numbers.Integral, np.integer)):
-     return np.random.RandomState(seed)
- if isinstance(seed, np.random.RandomState):
+     return np.random.default_rng(seed)
+ if isinstance(seed, np.random.Generator):
      return seed
  raise ValueError('%r cannot be used to seed a np.random.RandomState instance' % seed)
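With this change, gensim.utils.get_random_state normalizes every accepted seed type to a np.random.Generator. A usage sketch:

import numpy as np
from gensim import utils

rng1 = utils.get_random_state(None)                      # fresh Generator
rng2 = utils.get_random_state(0)                         # Generator seeded with 0
rng3 = utils.get_random_state(np.random.default_rng(0))  # passed through unchanged

print(rng2.integers(0, 10, size=3))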

