Updated RandomState (deprecated in NumPy) to default_rng (Generator)
This addresses issue piskvorky#2782.

Here are benchmarks from before and after the update:

Test module      Before update               After update
Poincare         Ran 42 tests in 0.418s      Ran 42 tests in 0.417s
test_lda         Ran 48 tests in 223.845s    Ran 48 tests in 225.561s
utils            Ran 24 tests in 0.007s      Ran 24 tests in 0.007s
test_matutils    Ran 18 tests in 0.071s      Ran 18 tests in 0.070s
word2vec         Ran 79 tests in 58.149s     Ran 79 tests in 57.950s

I don't see a big difference in the time taken. However, I feel it is good to stay up to date with NumPy.
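For reference, the migration swaps legacy np.random.RandomState methods for their np.random.Generator counterparts. A minimal sketch of the correspondence used throughout this commit (variable names here are illustrative only):

import numpy as np

legacy = np.random.RandomState(42)   # legacy API
rng = np.random.default_rng(42)      # new Generator API

legacy.rand(3)                       # uniform floats in [0, 1)
rng.random(3)                        # Generator equivalent of rand()

legacy.randint(0, 10, size=5)        # integers in [0, 10)
rng.integers(0, 10, size=5)          # Generator equivalent of randint()

Note that the two APIs produce different streams even with the same seed, so runs seeded under RandomState are not reproduced bit-for-bit by default_rng.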
SagarDollin committed Aug 28, 2021
1 parent c0f384c commit 8443280
Showing 8 changed files with 45 additions and 17 deletions.
2 changes: 1 addition & 1 deletion gensim/models/ldamodel.py
@@ -1174,7 +1174,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
num_topics = min(num_topics, self.num_topics)

# add a little random jitter, to randomize results around the same alpha
- sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))
+ sort_alpha = self.alpha + 0.0001 * self.random_state.random(len(self.alpha))
# random_state.rand returns float64, but converting back to dtype won't speed up anything

sorted_topics = list(matutils.argsort(sort_alpha))
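One caveat on this hunk: Generator.integers(low=0, high=1) samples from the half-open range [0, 1) and always returns 0, which would zero out the jitter; the drop-in replacement for RandomState.rand(n) is Generator.random(n), as used above. A quick check (illustrative only):

import numpy as np

rng = np.random.default_rng(0)
print(rng.integers(low=0, high=1, size=5))  # [0 0 0 0 0] -- high is exclusive
print(rng.random(5))                        # uniform floats in [0, 1), like RandomState.rand(5)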
7 changes: 5 additions & 2 deletions gensim/models/poincare.py
@@ -164,7 +164,7 @@ def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsil
self._burn_in_done = False
self.dtype = dtype
self.seed = seed
- self._np_random = np_random.RandomState(seed)
+ self._np_random = np_random.default_rng(seed)
self.init_range = init_range
self._loss_grad = None
self.build_vocab(train_data)
@@ -264,7 +264,10 @@ def _get_candidate_negatives(self):
# this is to avoid floating point errors that result when the number of nodes is very high
# for reference: https://github.com/RaRe-Technologies/gensim/issues/1917
max_cumsum_value = self._node_counts_cumsum[-1]
- uniform_numbers = self._np_random.randint(1, max_cumsum_value + 1, self._negatives_buffer_size)
+ if isinstance(self._np_random, np.random.Generator):
+     uniform_numbers = self._np_random.integers(1, max_cumsum_value + 1, self._negatives_buffer_size)
+ else:
+     uniform_numbers = self._np_random.randint(1, max_cumsum_value + 1, self._negatives_buffer_size)
cumsum_table_indices = np.searchsorted(self._node_counts_cumsum, uniform_numbers)
self._negatives_buffer = NegativesBuffer(cumsum_table_indices)
return self._negatives_buffer.get_items(self.negative)
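The same Generator-or-RandomState dispatch recurs in word2vec_inner.pyx below. A hypothetical helper that would centralize it (a sketch only, not part of this commit; draw_integers is an invented name):

import numpy as np

def draw_integers(rng, low, high, size):
    """Draw `size` integers in [low, high) from either random API."""
    if isinstance(rng, np.random.Generator):
        return rng.integers(low, high, size)
    return rng.randint(low, high, size)  # legacy RandomState path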
14 changes: 14 additions & 0 deletions gensim/models/test_poincare.py
@@ -0,0 +1,14 @@
from gensim.models.poincare import PoincareModel, PoincareRelations
from time import time
import numpy as np

# rough timing of PoincareModel training with the Generator-based RNG
t1 = time()
file_path = "C:\\Users\\sagar\\gensim\\gensim\\test\\test_data\\poincare_hypernyms_large.tsv"
model = PoincareModel(PoincareRelations(file_path), negative=2)
model.train(epochs=50)
t2 = time()
print(t2 - t1)

# inspect the docstrings of the legacy and new sampling methods
print(np.random.RandomState.rand.__doc__)
print(np.random.default_rng(1).gamma.__doc__)
2 changes: 1 addition & 1 deletion gensim/models/word2vec.py
@@ -384,7 +384,7 @@ def __init__(

self.window = int(window)
self.shrink_windows = bool(shrink_windows)
- self.random = np.random.RandomState(seed)
+ self.random = np.random.default_rng(seed)

self.hs = int(hs)
self.negative = int(negative)
21 changes: 16 additions & 5 deletions gensim/models/word2vec_inner.pyx
@@ -489,7 +489,10 @@ cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1
c[0].cum_table = <np.uint32_t *>(np.PyArray_DATA(model.cum_table))
c[0].cum_table_len = len(model.cum_table)
if c[0].negative or c[0].sample:
- c[0].next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)
+ if isinstance(model.random, np.random.Generator):
+     c[0].next_random = (2**24) * model.random.integers(0, 2**24) + model.random.integers(0, 2**24)
+ else:
+     c[0].next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)

# convert Python structures to primitive types, so we can release the GIL
c[0].work = <REAL_t *>np.PyArray_DATA(_work)
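The seeding line above packs two independent 24-bit draws into a single 48-bit value for the C-side PRNG state. A small illustration of the arithmetic (plain Python, illustrative only):

import numpy as np

rng = np.random.default_rng(0)
hi = int(rng.integers(0, 2**24))  # upper 24 bits
lo = int(rng.integers(0, 2**24))  # lower 24 bits
next_random = (2**24) * hi + lo   # uniformly distributed in [0, 2**48)
assert 0 <= next_random < 2**48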
@@ -567,8 +570,12 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss):

# precompute "reduced window" offsets in a single randint() call
  if model.shrink_windows:
-     for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
-         c.reduced_windows[i] = item
+     if isinstance(model.random, np.random.Generator):
+         for i, item in enumerate(model.random.integers(0, c.window, effective_words)):
+             c.reduced_windows[i] = item
+     else:
+         for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
+             c.reduced_windows[i] = item
  else:
      for i in range(effective_words):
          c.reduced_windows[i] = 0
@@ -667,8 +674,12 @@ def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss):

# precompute "reduced window" offsets in a single randint() call
  if model.shrink_windows:
-     for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
-         c.reduced_windows[i] = item
+     if isinstance(model.random, np.random.Generator):
+         for i, item in enumerate(model.random.integers(0, c.window, effective_words)):
+             c.reduced_windows[i] = item
+     else:
+         for i, item in enumerate(model.random.randint(0, c.window, effective_words)):
+             c.reduced_windows[i] = item
  else:
      for i in range(effective_words):
          c.reduced_windows[i] = 0
8 changes: 4 additions & 4 deletions gensim/test/test_ldamodel.py
@@ -31,9 +31,9 @@


def test_random_state():
- testcases = [np.random.seed(0), None, np.random.RandomState(0), 0]
+ testcases = [np.random.seed(0), None, np.random.default_rng(0), 0]
  for testcase in testcases:
-     assert(isinstance(utils.get_random_state(testcase), np.random.RandomState))
+     assert(isinstance(utils.get_random_state(testcase), np.random.Generator))


class TestLdaModel(unittest.TestCase, basetmtests.TestBaseTopicModel):
@@ -51,8 +51,8 @@ def test_sync_state(self):
assert_allclose(self.model.get_topics(), model2.get_topics(), rtol=1e-5)

# properly continues training on the new state
- self.model.random_state = np.random.RandomState(0)
- model2.random_state = np.random.RandomState(0)
+ self.model.random_state = np.random.default_rng(0)
+ model2.random_state = np.random.default_rng(0)
self.model.passes = 1
model2.passes = 1
self.model.update(self.corpus)
2 changes: 1 addition & 1 deletion gensim/test/test_matutils.py
@@ -86,7 +86,7 @@ def dirichlet_expectation(alpha):

class TestLdaModelInner(unittest.TestCase):
def setUp(self):
- self.random_state = np.random.RandomState()
+ self.random_state = np.random.default_rng()
self.num_runs = 100 # test functions with *num_runs* random inputs
self.num_topics = 100

6 changes: 3 additions & 3 deletions gensim/utils.py
@@ -86,10 +86,10 @@ def get_random_state(seed):
"""
  if seed is None or seed is np.random:
-     return np.random.mtrand._rand
+     return np.random.default_rng()
  if isinstance(seed, (numbers.Integral, np.integer)):
-     return np.random.RandomState(seed)
- if isinstance(seed, np.random.RandomState):
+     return np.random.default_rng(seed)
+ if isinstance(seed, np.random.Generator):
      return seed
  raise ValueError('%r cannot be used to seed a np.random.RandomState instance' % seed)
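With this change, gensim.utils.get_random_state normalizes every accepted seed type to a np.random.Generator. A usage sketch:

import numpy as np
from gensim import utils

rng1 = utils.get_random_state(None)                      # fresh Generator
rng2 = utils.get_random_state(0)                         # Generator seeded with 0
rng3 = utils.get_random_state(np.random.default_rng(0))  # passed through unchanged

print(rng2.integers(0, 10, size=3))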

