diff --git a/docs/notebooks/keras_wrapper.ipynb b/docs/notebooks/keras_wrapper.ipynb index 05b71e4594..88d24af681 100644 --- a/docs/notebooks/keras_wrapper.ipynb +++ b/docs/notebooks/keras_wrapper.ipynb @@ -38,9 +38,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": false, + "scrolled": true }, "outputs": [ { @@ -59,12 +60,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Next we create a dummy set of sentences to train the Word2Vec model associated with the wrapper." + "Next, we create a dummy set of sentences to train our Word2Vec model." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": { "collapsed": true }, @@ -87,12 +88,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Then, we call the wrapper and pass appropriate parameters." + "Then, we create the Word2Vec model by passing appropriate parameters." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": { "collapsed": false }, @@ -109,42 +110,6 @@ "model = word2vec.Word2Vec(sentences, size=100, min_count=1, hs=1)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can use methods and atributes associated with the Word2Vec model on the model returned by the wrapper." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('human', 0.21846069395542145), ('eps', 0.14406153559684753), ('system', 0.12887781858444214), ('time', 0.12749385833740234), ('computer', 0.10715052485466003), ('minors', 0.08211944997310638), ('user', 0.031229231506586075), ('interface', 0.016254140064120293), ('trees', 0.005966894328594208), ('survey', -0.10215148329734802)]\n" - ] - } - ], - "source": [ - "sims = model.most_similar('graph', topn=10) #words most similar to 'graph'\n", - "print sims" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "As with Word2Vec models, the results obtained after training on small input can be unexpected. " - ] - }, { "cell_type": "markdown", "metadata": { @@ -158,12 +123,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As an example of using the wrapper with Keras, we try to use the wrapper for word similarity task where we compute the cosine distance as a measure of similarity between the two words." + "As an example of integrating Gensim's Word2Vec model with Keras, we consider a word similarity task where we compute the cosine distance between two words as a measure of their similarity."
] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "collapsed": false }, @@ -184,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "collapsed": false }, @@ -203,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "metadata": { "collapsed": false }, @@ -236,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "metadata": { "collapsed": false }, @@ -252,7 +217,9 @@ "source": [ "word_a = 'graph'\n", "word_b = 'trees'\n", - "output = keras_model.predict([np.asarray([model.wv.vocab[word_a].index]), np.asarray([model.wv.vocab[word_b].index])]) # output is the cosine distance between the two words (as a similarity measure)\n", + "# output is the cosine distance between the two words (as a similarity measure)\n", + "output = keras_model.predict([np.asarray([model.wv.vocab[word_a].index]), np.asarray([model.wv.vocab[word_b].index])])\n", + "\n", "print output" ] }, @@ -269,12 +236,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To see how this wrapper could be used while dealing with a real supervised (classification) task, we consider the [20NewsGroups](qwone.com/~jason/20Newsgroups/) task. Here, we take a smaller version of this data by taking a subset of the documents to be classified. First, we import the necessary modules." + "To see how Gensim's Word2Vec model could be integrated with Keras while dealing with a real supervised (classification) task, we consider the [20NewsGroups](qwone.com/~jason/20Newsgroups/) task. Here, we take a smaller version of this data by taking a subset of the documents to be classified. \n", + "\n", + "First, we import the necessary modules." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": { "collapsed": false }, @@ -306,7 +275,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": { "collapsed": false }, @@ -346,7 +315,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "collapsed": false }, @@ -371,12 +340,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As the next step, we prepare the embedding layer for which we use the wrapper as follows." + "As the next step, we prepare the embedding layer to be used in our actual Keras model." 
] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": { "collapsed": false }, @@ -406,7 +375,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "collapsed": false }, @@ -416,24 +385,24 @@ "output_type": "stream", "text": [ "Epoch 1/5\n", - "137/137 [==============================] - 3s - loss: 1.0396 - acc: 0.4526 \n", + "137/137 [==============================] - 2s - loss: 1.0051 - acc: 0.4088 \n", "Epoch 2/5\n", - "137/137 [==============================] - 2s - loss: 0.8995 - acc: 0.4161 \n", + "137/137 [==============================] - 2s - loss: 0.9640 - acc: 0.4891 \n", "Epoch 3/5\n", - "137/137 [==============================] - 2s - loss: 0.9866 - acc: 0.4526 \n", + "137/137 [==============================] - 2s - loss: 0.8881 - acc: 0.4891 \n", "Epoch 4/5\n", - "137/137 [==============================] - 2s - loss: 0.8957 - acc: 0.4891 \n", + "137/137 [==============================] - 2s - loss: 0.9136 - acc: 0.4453 \n", "Epoch 5/5\n", - "137/137 [==============================] - 2s - loss: 0.9002 - acc: 0.4891 \n" + "137/137 [==============================] - 2s - loss: 0.8823 - acc: 0.4891 \n" ] }, { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -482,7 +451,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 20, "metadata": { "collapsed": false }, @@ -498,8 +467,7 @@ "from gensim.models import keyedvectors\n", "from collections import defaultdict\n", "\n", - "import pandas as pd\n", - "import spacy" + "import pandas as pd" ] }, { @@ -511,7 +479,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 21, "metadata": { "collapsed": true }, @@ -519,15 +487,15 @@ "source": [ "# global variables\n", "\n", - "nb_filters=1200 # number of filters\n", - "n_gram=2 # n-gram, or window size of CNN/ConvNet\n", - "maxlen=15 # maximum number of words in a sentence\n", - "vecsize=300 # length of the embedded vectors in the model \n", - "cnn_dropout=0.0 # dropout rate for CNN/ConvNet\n", - "final_activation='softmax' # activation function. Options: softplus, softsign, relu, tanh, sigmoid, hard_sigmoid, linear.\n", - "dense_wl2reg=0.0 # dense_wl2reg: L2 regularization coefficient\n", - "dense_bl2reg=0.0 # dense_bl2reg: L2 regularization coefficient for bias\n", - "optimizer='adam' # optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam\n", + "nb_filters = 1200 # number of filters\n", + "n_gram = 2 # n-gram, or window size of CNN/ConvNet\n", + "maxlen = 15 # maximum number of words in a sentence\n", + "vecsize = 300 # length of the embedded vectors in the model \n", + "cnn_dropout = 0.0 # dropout rate for CNN/ConvNet\n", + "final_activation = 'softmax' # activation function. Options: softplus, softsign, relu, tanh, sigmoid, hard_sigmoid, linear.\n", + "dense_wl2reg = 0.0 # dense_wl2reg: L2 regularization coefficient\n", + "dense_bl2reg = 0.0 # dense_bl2reg: L2 regularization coefficient for bias\n", + "optimizer = 'adam' # optimizer for gradient descent. 
Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam\n", "\n", "# utility functions\n", "\n", @@ -539,18 +507,18 @@ " \"\"\"\n", " df = pd.read_csv(filepath)\n", " category_col, descp_col = df.columns.values.tolist()\n", - " shorttextdict = defaultdict(lambda : [])\n", + " shorttextdict = dict()\n", " for category, descp in zip(df[category_col], df[descp_col]):\n", - " if type(descp)==str:\n", - " shorttextdict[category] += [descp]\n", - " return dict(shorttextdict)\n", + " if type(descp) == str:\n", + " shorttextdict.setdefault(category, []).append(descp)\n", + " return shorttextdict\n", "\n", "def subjectkeywords():\n", " \"\"\"\n", " Return an example data set, with three subjects and corresponding keywords.\n", " This is in the format of the training input.\n", " \"\"\"\n", - " data_path = './datasets/keras_classifier_training_data.csv'\n", + " data_path = os.path.join(os.getcwd(), 'datasets/keras_classifier_training_data.csv')\n", " return retrieve_csvdata_as_dict(data_path)\n", "\n", "def convert_trainingdata(classdict):\n", @@ -565,7 +533,7 @@ " indices = []\n", " for label in classlabels:\n", " for shorttext in classdict[label]:\n", - " shorttext = shorttext if type(shorttext)==str else ''\n", + " shorttext = shorttext if type(shorttext) == str else ''\n", " category_bucket = [0]*len(classlabels)\n", " category_bucket[lblidx_dict[label]] = 1\n", " indices.append(category_bucket)\n", @@ -594,7 +562,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 22, "metadata": { "collapsed": false }, @@ -609,7 +577,7 @@ ], "source": [ "# we are training our Word2Vec model here\n", - "w2v_training_data_path = './datasets/word_vectors_training_data.txt'\n", + "w2v_training_data_path = os.path.join(os.getcwd(), 'datasets/word_vectors_training_data.txt')\n", "input_data = word2vec.LineSentence(w2v_training_data_path)\n", "w2v_model = word2vec.Word2Vec(input_data, size=300)\n", "w2v_model_wv = w2v_model.wv\n", @@ -628,7 +596,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 23, "metadata": { "collapsed": false }, @@ -648,7 +616,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 24, "metadata": { "collapsed": false }, @@ -675,7 +643,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 25, "metadata": { "collapsed": false }, @@ -685,25 +653,25 @@ "output_type": "stream", "text": [ "Epoch 1/10\n", - "45/45 [==============================] - 0s - loss: 1.1000 - acc: 0.4000 \n", + "45/45 [==============================] - 0s - loss: 1.1154 - acc: 0.2222 \n", "Epoch 2/10\n", - "45/45 [==============================] - 0s - loss: 1.0842 - acc: 0.3556 \n", + "45/45 [==============================] - 0s - loss: 1.0949 - acc: 0.3333 \n", "Epoch 3/10\n", - "45/45 [==============================] - 0s - loss: 0.9784 - acc: 0.9111 \n", + "45/45 [==============================] - 0s - loss: 1.0426 - acc: 0.8667 \n", "Epoch 4/10\n", - "45/45 [==============================] - 0s - loss: 0.7570 - acc: 0.9778 \n", + "45/45 [==============================] - 0s - loss: 0.8931 - acc: 0.9556 \n", "Epoch 5/10\n", - "45/45 [==============================] - 0s - loss: 0.5210 - acc: 0.9778 \n", + "45/45 [==============================] - 0s - loss: 0.6967 - acc: 0.9778 \n", "Epoch 6/10\n", - "45/45 [==============================] - 0s - loss: 0.3203 - acc: 0.9778 \n", + "45/45 [==============================] - 0s - loss: 0.4727 - acc: 0.9556 \n", "Epoch 7/10\n", - "45/45 [==============================] - 0s - 
loss: 0.1927 - acc: 0.9778 \n", + "45/45 [==============================] - 0s - loss: 0.2991 - acc: 0.9778 \n", "Epoch 8/10\n", - "45/45 [==============================] - 0s - loss: 0.1242 - acc: 0.9778 \n", + "45/45 [==============================] - 0s - loss: 0.1795 - acc: 0.9778 \n", "Epoch 9/10\n", - "45/45 [==============================] - 0s - loss: 0.0989 - acc: 0.9556 \n", + "45/45 [==============================] - 0s - loss: 0.1218 - acc: 0.9778 \n", "Epoch 10/10\n", - "45/45 [==============================] - 0s - loss: 0.0810 - acc: 0.9778 \n" + "45/45 [==============================] - 0s - loss: 0.0889 - acc: 0.9778 \n" ] } ], @@ -730,7 +698,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 26, "metadata": { "collapsed": false }, @@ -739,7 +707,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'mathematics': 0.98286772, 'physics': 0.0081670163, 'theology': 0.008965265}\n" + "{'mathematics': 0.96289372, 'physics': 0.025273025, 'theology': 0.011833278}\n" ] } ], @@ -762,7 +730,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The result above clearly suggests (~ 98% probability!) that the input `artificial intellegence` should belong to the category `mathematics`, which conforms very well with the expected output in this case.\n", + "The result above clearly suggests (~ 96% probability!) that the input `artificial intelligence` should belong to the category `mathematics`, which conforms very well with the expected output in this case.\n", "In general, the output could depend on several factors including the number of filters for the conv-net, the training data for the word-vectors, the training data for the classifier etc." ] }, diff --git a/gensim/test/test_keras_integration.py b/gensim/test/test_keras_integration.py index b59d6612c6..ced285c77c 100644 --- a/gensim/test/test_keras_integration.py +++ b/gensim/test/test_keras_integration.py @@ -79,7 +79,9 @@ def testEmbeddingLayerCosineSim(self): word_a = 'graph' word_b = 'trees' - output = model.predict([np.asarray([keras_w2v_model.wv.vocab[word_a].index]), np.asarray([keras_w2v_model.wv.vocab[word_b].index])]) # output is the cosine distance between the two words (as a similarity measure) + output = model.predict([np.asarray([keras_w2v_model.wv.vocab[word_a].index]), np.asarray([keras_w2v_model.wv.vocab[word_b].index])]) + # output is the cosine distance between the two words (as a similarity measure) + self.assertTrue(type(output[0][0][0]) == np.float32) # verify that a float is returned def testEmbeddingLayer20NewsGroup(self):