piskvorky · menshikh-iv · Oct 25, 2017 · Oct 25, 2017 · Oct 25, 2017 · Oct 25, 2017
diff --git a/docs/notebooks/annoytutorial.ipynb b/docs/notebooks/annoytutorial.ipynb
@@ -41,21 +41,21 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPython 3.6.0\n",
-      "IPython 6.0.0\n",
+      "CPython 3.5.3\n",
+      "IPython 6.2.1\n",
       "\n",
-      "gensim 2.1.0\n",
-      "numpy 1.12.1\n",
-      "scipy 0.19.0\n",
-      "psutil 5.2.2\n",
-      "matplotlib 2.0.0\n",
+      "gensim 3.0.1\n",
+      "numpy 1.13.3\n",
+      "scipy 1.0.0\n",
+      "psutil 5.4.0\n",
+      "matplotlib 2.1.0\n",
       "\n",
-      "compiler   : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)\n",
+      "compiler   : GCC 6.3.0 20170406\n",
       "system     : Linux\n",
-      "release    : 4.9.27-moby\n",
+      "release    : 4.10.0-37-generic\n",
       "machine    : x86_64\n",
       "processor  : x86_64\n",
-      "CPU cores  : 4\n",
+      "CPU cores  : 8\n",
       "interpreter: 64bit\n"
      ]
     }
@@ -76,9 +76,7 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import os.path\n",
@@ -98,9 +96,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "LOGS = False\n",
@@ -136,28 +132,18 @@
     "from gensim.models import Word2Vec, KeyedVectors\n",
     "from gensim.models.word2vec import Text8Corpus\n",
     "\n",
-    "# using params from Word2Vec_FastText_Comparison\n",
-    "\n",
-    "lr = 0.05\n",
-    "dim = 100\n",
-    "ws = 5\n",
-    "epoch = 5\n",
-    "minCount = 5\n",
-    "neg = 5\n",
-    "loss = 'ns'\n",
-    "t = 1e-4\n",
+    "# Using params from Word2Vec_FastText_Comparison\n",
     "\n",
-    "# Same values as used for fastText training above\n",
     "params = {\n",
-    "    'alpha': lr,\n",
-    "    'size': dim,\n",
-    "    'window': ws,\n",
-    "    'iter': epoch,\n",
-    "    'min_count': minCount,\n",
-    "    'sample': t,\n",
+    "    'alpha': 0.05,\n",
+    "    'size': 100,\n",
+    "    'window': 5,\n",
+    "    'iter': 5,\n",
+    "    'min_count': 5,\n",
+    "    'sample': 1e-4,\n",
     "    'sg': 1,\n",
     "    'hs': 0,\n",
-    "    'negative': neg\n",
+    "    'negative': 5\n",
     "}\n",
     "\n",
     "model = Word2Vec(Text8Corpus('text8'), **params)\n",
@@ -181,16 +167,11 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "#Set up the model and vector that we are using in the comparison\n",
-    "try:\n",
-    "    from gensim.similarities.index import AnnoyIndexer\n",
-    "except ImportError:\n",
-    "    raise ValueError(\"SKIP: Please install the annoy indexer\")\n",
+    "# Set up the model and vector that we are using in the comparison\n",
+    "from gensim.similarities.index import AnnoyIndexer\n",
     "\n",
     "model.init_sims()\n",
     "annoy_index = AnnoyIndexer(model, 100)"
@@ -204,11 +185,11 @@
     {
      "data": {
       "text/plain": [
-       "[('the', 1.0000001192092896),\n",
-       " ('of', 0.8333191275596619),\n",
-       " ('in', 0.8258030414581299),\n",
-       " ('a', 0.7722446918487549),\n",
-       " ('and', 0.7408151626586914)]"
+       "[('the', 0.9999999403953552),\n",
+       " ('of', 0.8254586458206177),\n",
+       " ('in', 0.8207480907440186),\n",
+       " ('a', 0.7935141324996948),\n",
+       " ('and', 0.7539303302764893)]"
       ]
      },
      "execution_count": 6,
@@ -226,9 +207,7 @@
   {
    "cell_type": "code",
    "execution_count": 7,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import time\n",
@@ -238,9 +217,7 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def avg_query_time(annoy_index=None, queries=1000):\n",
@@ -266,10 +243,10 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Gensim (s/query):\t0.01409\n",
-      "Annoy (s/query):\t0.00031\n",
+      "Gensim (s/query):\t0.02066\n",
+      "Annoy (s/query):\t0.00038\n",
       "\n",
-      "Annoy is 44.94 times faster on average on this particular run\n"
+      "Annoy is 54.59 times faster on average on this particular run\n"
      ]
     }
    ],
@@ -329,30 +306,30 @@
      "output_type": "stream",
      "text": [
       "Approximate Neighbors\n",
-      "('science', 0.9998273665114539)\n",
-      "('multidisciplinary', 0.6123671233654022)\n",
-      "('sciences', 0.6045806407928467)\n",
-      "('astrobiology', 0.5991603136062622)\n",
-      "('aaas', 0.5971885621547699)\n",
-      "('bimonthly', 0.5882039070129395)\n",
-      "('interdisciplinary', 0.5875678360462189)\n",
-      "('psychohistory', 0.5828642845153809)\n",
-      "('protoscience', 0.5820913016796112)\n",
-      "('scientific', 0.5779787003993988)\n",
-      "('transhumanism', 0.5754979848861694)\n",
+      "('science', 1.0)\n",
+      "('multidisciplinary', 0.6066591441631317)\n",
+      "('astrobiology', 0.5995452105998993)\n",
+      "('actuarial', 0.5984143614768982)\n",
+      "('robotics', 0.5919757187366486)\n",
+      "('sciences', 0.5884003043174744)\n",
+      "('scientific', 0.5805909633636475)\n",
+      "('interdisciplinary', 0.5763890445232391)\n",
+      "('astronautics', 0.5748652517795563)\n",
+      "('psychohistory', 0.5744689702987671)\n",
+      "('aaas', 0.574154257774353)\n",
       "\n",
       "Normal (not Annoy-indexed) Neighbors\n",
-      "('science', 0.9999998807907104)\n",
-      "('fiction', 0.7650254964828491)\n",
-      "('multidisciplinary', 0.6994814872741699)\n",
-      "('sciences', 0.6872870922088623)\n",
-      "('astrobiology', 0.6786551475524902)\n",
-      "('aaas', 0.6754858493804932)\n",
-      "('technology', 0.6748392581939697)\n",
-      "('bimonthly', 0.6608479619026184)\n",
-      "('interdisciplinary', 0.6597993969917297)\n",
-      "('astronautics', 0.6552520990371704)\n",
-      "('psychohistory', 0.6519955396652222)\n"
+      "('science', 1.0)\n",
+      "('fiction', 0.7570418119430542)\n",
+      "('multidisciplinary', 0.6905661225318909)\n",
+      "('astrobiology', 0.6792721152305603)\n",
+      "('actuarial', 0.6774581670761108)\n",
+      "('robotics', 0.6670321822166443)\n",
+      "('vinge', 0.6633784770965576)\n",
+      "('sciences', 0.6611713767051697)\n",
+      "('vernor', 0.6521490812301636)\n",
+      "('popularizer', 0.6499912738800049)\n",
+      "('scientific', 0.648192286491394)\n"
      ]
     }
    ],
@@ -406,9 +383,7 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "fname = '/tmp/mymodel.index'\n",
@@ -432,17 +407,17 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "('science', 0.9998273665114539)\n",
-      "('multidisciplinary', 0.6123671233654022)\n",
-      "('sciences', 0.6045806407928467)\n",
-      "('astrobiology', 0.5991603136062622)\n",
-      "('aaas', 0.5971885621547699)\n",
-      "('bimonthly', 0.5882039070129395)\n",
-      "('interdisciplinary', 0.5875678360462189)\n",
-      "('psychohistory', 0.5828642845153809)\n",
-      "('protoscience', 0.5820913016796112)\n",
-      "('scientific', 0.5779787003993988)\n",
-      "('transhumanism', 0.5754979848861694)\n"
+      "('science', 1.0)\n",
+      "('multidisciplinary', 0.6066591441631317)\n",
+      "('astrobiology', 0.5995452105998993)\n",
+      "('actuarial', 0.5984143614768982)\n",
+      "('robotics', 0.5919757187366486)\n",
+      "('sciences', 0.5884003043174744)\n",
+      "('scientific', 0.5805909633636475)\n",
+      "('interdisciplinary', 0.5763890445232391)\n",
+      "('astronautics', 0.5748652517795563)\n",
+      "('psychohistory', 0.5744689702987671)\n",
+      "('aaas', 0.574154257774353)\n"
      ]
     }
    ],
@@ -477,9 +452,7 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Remove verbosity from code below (if logging active)\n",
@@ -491,9 +464,7 @@
   {
    "cell_type": "code",
    "execution_count": 14,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from multiprocessing import Process\n",
@@ -517,16 +488,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Process Id:  311\n",
+      "Process Id: 18708\n",
       "\n",
-      "Memory used by process 311:  pmem(rss=534646784, vms=1907343360, shared=12107776, text=4096, lib=0, data=563171328, dirty=0) \n",
+      "Memory used by process 18708: pmem(rss=544612352, vms=2047995904, shared=10641408, text=4120576, lib=0, data=823377920, dirty=0)\n",
       "---\n",
-      "Process Id:  320\n",
+      "Process Id: 18715\n",
       "\n",
-      "Memory used by process 320:  pmem(rss=534663168, vms=1907343360, shared=12107776, text=4096, lib=0, data=563204096, dirty=0) \n",
+      "Memory used by process 18715: pmem(rss=544624640, vms=2047995904, shared=10641408, text=4120576, lib=0, data=823386112, dirty=0)\n",
       "---\n",
-      "CPU times: user 540 ms, sys: 180 ms, total: 720 ms\n",
-      "Wall time: 24.5 s\n"
+      "CPU times: user 464 ms, sys: 68 ms, total: 532 ms\n",
+      "Wall time: 45.3 s\n"
      ]
     }
    ],
@@ -569,16 +540,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Process Id:  329\n",
+      "Process Id: 18733\n",
       "\n",
-      "Memory used by process 329:  pmem(rss=514174976, vms=1885904896, shared=142942208, text=4096, lib=0, data=411869184, dirty=0) \n",
+      "Memory used by process 18733: pmem(rss=525369344, vms=2028597248, shared=140480512, text=4120576, lib=0, data=674148352, dirty=0)\n",
       "---\n",
-      "Process Id:  338\n",
+      "Process Id: 18740\n",
       "\n",
-      "Memory used by process 338:  pmem(rss=514174976, vms=1885904896, shared=142942208, text=4096, lib=0, data=411869184, dirty=0) \n",
+      "Memory used by process 18740: pmem(rss=525365248, vms=2028597248, shared=140480512, text=4120576, lib=0, data=674148352, dirty=0)\n",
       "---\n",
-      "CPU times: user 490 ms, sys: 210 ms, total: 700 ms\n",
-      "Wall time: 2.62 s\n"
+      "CPU times: user 444 ms, sys: 96 ms, total: 540 ms\n",
+      "Wall time: 2.06 s\n"
      ]
     }
    ],
@@ -617,9 +588,7 @@
   {
    "cell_type": "code",
    "execution_count": 17,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import matplotlib.pyplot as plt\n",
@@ -719,9 +688,7 @@
   {
    "cell_type": "code",
    "execution_count": 20,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# To export our model as text\n",
@@ -737,18 +704,19 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "71290 100\n",
-      "the 0.405333 0.074649 0.154192 0.091247 -0.036666 -0.079057 0.056531 0.012814 0.046281 0.056158 0.209166 -0.046209 0.252618 0.022687 0.239388 -0.122108 -0.028497 -0.098760 -0.334427 0.029130 0.117470 -0.237462 0.064778 -0.053481 -0.165359 0.223160 0.104593 0.144142 0.115136 0.142812 0.201899 0.171716 0.256478 0.142440 -0.150566 -0.175130 0.144592 0.156056 -0.181402 0.103827 -0.173085 0.053641 -0.085016 0.367614 -0.225947 0.033068 0.079073 0.134803 -0.303063 -0.104457 0.079638 -0.135635 -0.072654 0.001361 0.187478 -0.221080 -0.111177 0.071005 0.091342 0.020156 -0.157671 -0.075755 0.098052 -0.065106 0.201720 -0.064369 0.080100 -0.238081 -0.078123 -0.156004 -0.053440 0.234423 -0.117426 -0.127303 0.180088 -0.004023 -0.042677 0.059902 0.453670 -0.063391 -0.049869 0.060019 0.104559 0.085386 -0.071030 -0.117753 -0.032831 0.009222 0.100854 0.082896 -0.288745 -0.015596 -0.138211 0.017519 -0.044955 -0.002358 -0.084262 -0.127057 0.155300 0.342515\n",
-      "of 0.302899 0.135698 0.276234 0.060655 -0.121023 -0.036229 0.251403 0.087931 0.143489 0.086507 0.171695 -0.108421 0.168884 0.031430 0.128453 -0.157933 -0.041587 -0.012564 -0.242977 -0.134526 0.098855 -0.125527 0.114153 -0.197138 -0.167243 0.415763 -0.067183 0.244922 0.044159 0.178697 0.244680 0.156735 0.322327 0.050362 -0.196953 -0.211732 0.300875 0.184376 -0.071861 -0.000714 0.028612 0.156463 0.046373 0.274268 -0.103168 -0.144895 0.079764 0.314170 -0.236254 -0.108111 0.012367 -0.053291 0.079590 -0.057262 0.221644 -0.259905 -0.120234 0.005212 0.096316 -0.044126 -0.212473 -0.228809 0.089850 -0.023453 0.316282 0.087361 0.168300 -0.239052 0.062733 -0.178071 -0.023161 0.146075 -0.150015 -0.191352 0.136295 0.082557 -0.043620 0.213094 0.413238 -0.205452 -0.115454 -0.051733 0.132394 0.093741 -0.128791 -0.159032 0.015310 -0.135258 -0.099603 -0.042002 -0.193415 -0.032718 -0.341820 0.002871 -0.069954 -0.009055 -0.073843 -0.043583 0.052326 0.348435\n"
+      "b'71290 100'\n",
+      "b'the 0.141686 0.255228 -0.191478 0.232801 0.094346 0.120224 0.075487 0.032936 0.154292 -0.063886 -0.321305 0.128102 0.072219 0.081531 -0.080868 -0.000505 -0.094688 -0.031570 -0.022748 -0.030894 0.118537 -0.091672 0.268565 0.017336 -0.158142 0.028882 -0.354505 -0.248104 0.114017 -0.132821 -0.068284 -0.311653 -0.109148 0.071787 0.391749 0.027252 -0.192908 0.323144 0.100474 -0.049426 -0.157461 -0.289598 0.148029 0.059920 -0.084889 -0.012278 0.041439 0.109375 -0.123536 -0.001224 0.112495 -0.138175 0.114445 -0.208958 0.253858 -0.033594 0.145608 0.295680 -0.008925 0.032524 0.192903 0.035965 0.135603 -0.103187 0.162365 0.031851 0.017547 -0.106019 0.094497 0.071965 0.068053 0.024725 -0.003645 0.001062 0.078102 -0.172048 0.093869 -0.035663 -0.166211 0.176462 0.049964 -0.114905 0.024031 -0.058539 -0.117258 -0.351215 -0.025666 -0.211885 0.036296 -0.326675 -0.182654 -0.019680 -0.189521 -0.206698 -0.100391 0.120583 0.076890 -0.010218 0.084345 -0.277560'\n",
+      "b'of 0.042654 0.329115 -0.062874 0.331052 0.041591 0.141496 0.023409 0.054587 0.003090 0.059803 -0.190404 0.169919 -0.001547 -0.005588 0.060066 0.089611 -0.072265 -0.230048 -0.028314 -0.115761 0.126566 -0.054547 0.366766 0.045456 0.011724 0.010946 -0.237676 -0.323509 0.232554 -0.039293 -0.049269 -0.085853 -0.215061 0.130000 0.347488 0.165928 -0.169574 0.305217 -0.017916 0.034427 -0.133006 -0.144247 0.150204 0.120708 0.053237 -0.183496 0.053565 0.030120 -0.115428 0.030555 0.115227 -0.206632 -0.043280 -0.194560 0.220410 -0.107236 -0.003629 0.253298 0.048558 -0.040416 0.225557 0.091650 0.052787 -0.052910 0.101683 0.113876 -0.105539 -0.056264 0.159010 0.211075 0.057890 -0.017479 0.124350 0.032155 0.097972 -0.220727 0.148302 -0.019309 -0.098981 0.180954 -0.064003 -0.011532 0.148809 0.071048 0.002689 -0.310323 -0.272785 -0.213483 0.030733 -0.217041 -0.346220 0.031555 -0.209962 -0.303856 -0.218638 0.012904 0.188286 0.030006 0.090853 -0.374457'\n"
      ]
     }
    ],
    "source": [
+    "from smart_open import smart_open\n",
     "# View the first 3 lines of the exported file\n",
     "\n",
     "# The first line has the total number of entries and the vector dimension count. \n",
     "# The next lines have a key (a string) followed by its vector.\n",
-    "with open('/tmp/vectors.txt') as myfile:\n",
+    "with smart_open('/tmp/vectors.txt') as myfile:\n",
     "    for i in range(3):\n",
     "        print(myfile.readline().strip())"
    ]
@@ -900,7 +868,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.0"
+   "version": "3.5.3"
   }
  },
  "nbformat": 4,