remove pattern dependency (#3012)
* get rid of pattern dependency

* get rid of six import in mmreader.pyx

* bump cython version to 0.29.21

Trying to work around "has no attribute '__reduce_cython__'" problem

* add six to list of dependencies

Why was it removed? Parts of the code still need it.

* rm removed file from docs

* Revert "add six to list of dependencies"

This reverts commit 4e52814.

* remove unused six import

* add friendly message

* update gitignore to include cython output

* update gitignore

* fix build

* Update docs/src/scripts/make_wiki_online.rst

* more friendliness

* Update gensim/scripts/segment_wiki.py

* Update gensim/scripts/segment_wiki.py

* skip broken tests

* flake8 fix

* Update CHANGELOG.md

Co-authored-by: Radim Řehůřek <radimrehurek@seznam.cz>
mpenkov and piskvorky authored Jan 17, 2021
1 parent 959f2dd commit 67f45da
Showing 15 changed files with 55 additions and 269 deletions.
.gitignore (15 additions, 0 deletions)

@@ -76,3 +76,18 @@ data
 *.inv
 *.js
 docs/_images/
+
+#
+# Generated by Cython
+#
+gensim/_matutils.c
+gensim/corpora/_mmreader.c
+gensim/models/doc2vec_corpusfile.cpp
+gensim/models/doc2vec_inner.cpp
+gensim/models/fasttext_corpusfile.cpp
+gensim/models/fasttext_inner.c
+gensim/models/nmf_pgd.c
+gensim/models/word2vec_corpusfile.cpp
+gensim/models/word2vec_inner.c
+
+.ipynb_checkpoints
CHANGELOG.md (1 addition, 0 deletions)

@@ -104,6 +104,7 @@ Production stability is important to Gensim, so we're improving the process of *
 * [#2926](https://github.com/RaRe-Technologies/gensim/pull/2926): Rename `num_words` to `topn` in dtm_coherence, by [@MeganStodel](https://github.com/MeganStodel)
 * [#2937](https://github.com/RaRe-Technologies/gensim/pull/2937): Remove Keras dependency, by [@piskvorky](https://github.com/piskvorky)
 * Removed all code, methods, attributes and functions marked as deprecated in [Gensim 3.8.3](https://github.com/RaRe-Technologies/gensim/releases/tag/3.8.3).
+* Removed pattern dependency (PR [#3012](https://github.com/RaRe-Technologies/gensim/pull/3012), [@mpenkov](https://github.com/mpenkov)). If you need to lemmatize, do it prior to passing the corpus to gensim.

---

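The changelog entry above leaves the "lemmatize first" step to the user. A minimal sketch of that workflow, using NLTK's WordNetLemmatizer as a stand-in lemmatizer (NLTK is an illustrative assumption here, not something this commit prescribes):

```python
# Hedged sketch: lemmatize before the corpus ever reaches Gensim.
# Assumes NLTK plus its WordNet data are installed: nltk.download('wordnet').
from nltk.stem import WordNetLemmatizer

from gensim import corpora

lemmatizer = WordNetLemmatizer()
raw_docs = [
    "the cats were chasing mice",
    "a mouse ran across the floor",
]

# Whitespace tokenization plus per-token lemmatization, done up front.
texts = [[lemmatizer.lemmatize(tok) for tok in doc.split()] for doc in raw_docs]

# Gensim only ever sees the already-lemmatized tokens.
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]
```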
docs/src/apiref.rst (0 additions, 1 deletion)

@@ -89,7 +89,6 @@ Modules:
    scripts/make_wikicorpus
    scripts/word2vec_standalone
    scripts/make_wiki_online
-   scripts/make_wiki_online_lemma
    scripts/make_wiki_online_nodebug
    scripts/word2vec2tensor
    scripts/segment_wiki
docs/src/scripts/make_wiki_online_lemma.rst (0 additions, 9 deletions)

This file was deleted.
gensim/corpora/_mmreader.pyx (1 addition, 3 deletions)

@@ -7,8 +7,6 @@ from __future__ import with_statement

 from gensim import utils
 
-from six import string_types
-from six.moves import range
 import logging
 
 cimport cython
@@ -187,7 +185,7 @@ cdef class MmReader():

         if offset == -1:
             return []
-        if isinstance(self.input, string_types):
+        if isinstance(self.input, str):
             fin, close_fin = utils.open(self.input, 'rb'), True
         else:
             fin, close_fin = self.input, False
gensim/corpora/wikicorpus.py (13 additions, 17 deletions)

@@ -12,11 +12,8 @@
 Notes
 -----
-If you have the `pattern <https://github.com/clips/pattern>`_ package installed,
-this module will use a fancy lemmatization to get a lemma of each token (instead of plain alphabetic tokenizer).
-
 See :mod:`gensim.scripts.make_wiki` for a canned (example) command-line script based on this module.
 """

import bz2
@@ -467,9 +464,8 @@ def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,
     Parameters
     ----------
-    args : (str, bool, str, int)
-        Article text, lemmatize flag (if True, :func:`~gensim.utils.lemmatize` will be used), article title,
-        page identificator.
+    args : (str, str, int)
+        Article text, article title, page identificator.
     tokenizer_func : function
         Function for tokenization (defaults is :func:`~gensim.corpora.wikicorpus.tokenize`).
         Needs to have interface:
@@ -487,12 +483,9 @@
         List of tokens from article, title and page id.
 
     """
-    text, lemmatize, title, pageid = args
+    text, title, pageid = args
     text = filter_wiki(text)
-    if lemmatize:
-        result = utils.lemmatize(text)
-    else:
-        result = tokenizer_func(text, token_min_len, token_max_len, lower)
+    result = tokenizer_func(text, token_min_len, token_max_len, lower)
     return result, title, pageid


@@ -574,7 +567,7 @@ class WikiCorpus(TextCorpus):
     >>> MmCorpus.serialize(corpus_path, wiki) # another 8h, creates a file in MatrixMarket format and mapping
 
     """
-    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
+    def __init__(self, fname, processes=None, lemmatize=None, dictionary=None,
                  filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
                  token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
         """Initialize the corpus.
@@ -588,9 +581,6 @@
             Path to the Wikipedia dump file.
         processes : int, optional
             Number of processes to run, defaults to `max(1, number of cpu - 1)`.
-        lemmatize : bool
-            Use lemmatization instead of simple regexp tokenization.
-            Defaults to `True` if you have the `pattern <https://github.com/clips/pattern>`_ package installed.
         dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
             Dictionary, if not provided, this scans the corpus once, to determine its vocabulary
             **IMPORTANT: this needs a really long time**.
@@ -618,14 +608,20 @@
             Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.
 
         """
+        if lemmatize is not None:
+            raise NotImplementedError(
+                'The lemmatize parameter is no longer supported. '
+                'If you need to lemmatize, use e.g. <https://github.com/clips/pattern>. '
+                'Perform lemmatization as part of your tokenization function and '
+                'pass it as the tokenizer_func parameter to this initializer.'
+            )
         self.fname = fname
         self.filter_namespaces = filter_namespaces
         self.filter_articles = filter_articles
         self.metadata = False
         if processes is None:
             processes = max(1, multiprocessing.cpu_count() - 1)
         self.processes = processes
-        self.lemmatize = lemmatize
         self.tokenizer_func = tokenizer_func
         self.article_min_tokens = article_min_tokens
         self.token_min_len = token_min_len
Expand Down Expand Up @@ -677,7 +673,7 @@ def get_texts(self):

         tokenization_params = (self.tokenizer_func, self.token_min_len, self.token_max_len, self.lower)
         texts = (
-            (text, self.lemmatize, title, pageid, tokenization_params)
+            (text, title, pageid, tokenization_params)
             for title, text, pageid
             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles)
         )
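The new NotImplementedError above tells users to fold lemmatization into `tokenizer_func`. A hedged sketch of such a replacement tokenizer, reusing gensim's default `tokenize` plus NLTK's WordNetLemmatizer (the NLTK dependency and the `lemmatizing_tokenizer` name are illustrative assumptions, not part of this commit):

```python
# Sketch only: a tokenizer_func matching the interface documented above,
# (str, int, int, bool) -> list of str, with lemmatization bolted on.
from nltk.stem import WordNetLemmatizer

from gensim.corpora.wikicorpus import WikiCorpus, tokenize

lemmatizer = WordNetLemmatizer()

def lemmatizing_tokenizer(text, token_min_len, token_max_len, lower):
    # Run gensim's default alphabetic tokenizer first, then lemmatize each token.
    tokens = tokenize(text, token_min_len, token_max_len, lower)
    return [lemmatizer.lemmatize(token) for token in tokens]

# Defined at module level so the multiprocessing workers can pickle it.
wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2', tokenizer_func=lemmatizing_tokenizer)
```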
gensim/models/word2vec_corpusfile.pyx (0 additions, 1 deletion)

@@ -15,7 +15,6 @@ import cython
 import numpy as np
 
 from gensim.utils import any2utf8
-from six import iteritems
 
 cimport numpy as np
 
gensim/scripts/make_wiki_lemma.py (0 additions, 1 deletion)

This file was deleted.
gensim/scripts/make_wiki_online_lemma.py (0 additions, 112 deletions)

This file was deleted.
gensim/scripts/make_wikicorpus.py (2 additions, 7 deletions)

@@ -29,10 +29,6 @@
 removing tokens that appear in more than 10%% of all documents). Defaults to
 100,000.
 
-If you have the `pattern` package installed, this script will use a fancy
-lemmatization to get a lemma of each token (instead of plain alphabetic
-tokenizer). The package is available at https://github.com/clips/pattern .
-
 Example:
     python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki
 """
@@ -74,13 +70,12 @@
     else:
         keep_words = DEFAULT_DICT_SIZE
     online = 'online' in program
-    lemmatize = 'lemma' in program
     debug = 'nodebug' not in program
 
     if online:
         dictionary = HashDictionary(id_range=keep_words, debug=debug)
         dictionary.allow_update = True  # start collecting document frequencies
-        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
+        wiki = WikiCorpus(inp, dictionary=dictionary)
         # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
         MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000, metadata=True)
         # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
@@ -89,7 +84,7 @@
         wiki.save(outp + '_corpus.pkl.bz2')
         dictionary.allow_update = False
     else:
-        wiki = WikiCorpus(inp, lemmatize=lemmatize)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
+        wiki = WikiCorpus(inp)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
         # only keep the most frequent words (out of total ~8.2m unique tokens)
         wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
         # save dictionary and bag-of-words (term-document frequency matrix)
gensim/scripts/segment_wiki.py (8 additions, 5 deletions)

@@ -268,7 +268,7 @@ class _WikiSectionsCorpus(WikiCorpus):
"""

def __init__(self, fileobj, min_article_character=200, processes=None,
lemmatize=utils.has_pattern(), filter_namespaces=('0',), include_interlinks=False):
lemmatize=None, filter_namespaces=('0',), include_interlinks=False):
"""
Parameters
----------
@@ -278,22 +278,25 @@ def __init__(self, fileobj, min_article_character=200, processes=None,
             Minimal number of character for article (except titles and leading gaps).
         processes : int, optional
             Number of processes, max(1, multiprocessing.cpu_count() - 1) if None.
-        lemmatize : bool, optional
-            If `pattern` package is installed, use fancier shallow parsing to get token lemmas.
-            Otherwise, use simple regexp tokenization.
         filter_namespaces : tuple of int, optional
             Enumeration of namespaces that will be ignored.
         include_interlinks: bool
             Whether or not interlinks should be included in the output
         """
+        if lemmatize is not None:
+            raise NotImplementedError(
+                'The lemmatize parameter is no longer supported since Gensim 4.0.0. '
+                'If you need to lemmatize, use e.g. https://github.com/clips/pattern '
+                'to preprocess your corpus before submitting it to Gensim.'
+            )
+
         self.fileobj = fileobj
         self.filter_namespaces = filter_namespaces
         self.metadata = False
         if processes is None:
             processes = max(1, multiprocessing.cpu_count() - 1)
         self.processes = processes
-        self.lemmatize = lemmatize
         self.min_article_character = min_article_character
         self.include_interlinks = include_interlinks

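The error message above points users at pattern for preprocessing. A rough sketch of what that could look like, assuming the pattern package is installed and that `pattern.en.lemma` behaves as pattern documents it (the `lemmatize_line` helper is hypothetical):

```python
# Sketch only: lemmatize plain text with pattern before it reaches Gensim.
# Assumes https://github.com/clips/pattern is installed; treat pattern.en.lemma
# as an assumption about pattern's API and verify against your installed version.
from pattern.en import lemma

def lemmatize_line(line):
    # Lowercase, split on whitespace, and map every token to its lemma.
    return ' '.join(lemma(token) for token in line.lower().split())

print(lemmatize_line('The striped bats were hanging on their feet'))
```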