remove pattern dependency (#3012)
* get rid of pattern dependency

* get rid of six import in mmreader.pyx

* bump cython version to 0.29.21

Trying to work around "has no attribute '__reduce_cython__'" problem

* add six to list of dependencies

Why was it removed? Parts of the code still need it.

* rm removed file from docs

* Revert "add six to list of dependencies"

This reverts commit 4e52814.

* remove unused six import

* add friendly message

* update gitignore to include cython output

* update gitignore

* fix build

* Update docs/src/scripts/make_wiki_online.rst

* more friendliness

* Update gensim/scripts/segment_wiki.py

* Update gensim/scripts/segment_wiki.py

* skip broken tests

* flake8 fix

* Update CHANGELOG.md

Co-authored-by: Radim Řehůřek <radimrehurek@seznam.cz>
mpenkov and piskvorky authored Jan 17, 2021
1 parent 959f2dd commit 67f45da
Showing 15 changed files with 55 additions and 269 deletions.
.gitignore (15 additions, 0 deletions)

@@ -76,3 +76,18 @@ data
 *.inv
 *.js
 docs/_images/
+
+#
+# Generated by Cython
+#
+gensim/_matutils.c
+gensim/corpora/_mmreader.c
+gensim/models/doc2vec_corpusfile.cpp
+gensim/models/doc2vec_inner.cpp
+gensim/models/fasttext_corpusfile.cpp
+gensim/models/fasttext_inner.c
+gensim/models/nmf_pgd.c
+gensim/models/word2vec_corpusfile.cpp
+gensim/models/word2vec_inner.c
+
+.ipynb_checkpoints
CHANGELOG.md (1 addition, 0 deletions)

@@ -104,6 +104,7 @@ Production stability is important to Gensim, so we're improving the process of *
 * [#2926](https://github.com/RaRe-Technologies/gensim/pull/2926): Rename `num_words` to `topn` in dtm_coherence, by [@MeganStodel](https://github.com/MeganStodel)
 * [#2937](https://github.com/RaRe-Technologies/gensim/pull/2937): Remove Keras dependency, by [@piskvorky](https://github.com/piskvorky)
 * Removed all code, methods, attributes and functions marked as deprecated in [Gensim 3.8.3](https://github.com/RaRe-Technologies/gensim/releases/tag/3.8.3).
+* Removed pattern dependency (PR [#3012](https://github.com/RaRe-Technologies/gensim/pull/3012), [@mpenkov](https://github.com/mpenkov)). If you need to lemmatize, do it prior to passing the corpus to gensim.

---

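The changelog entry above leaves the "lemmatize first" step to the user. A minimal sketch of that workflow, using NLTK's WordNetLemmatizer as a stand-in lemmatizer (NLTK is an illustrative assumption here, not something this commit prescribes):

```python
# Hedged sketch: lemmatize before the corpus ever reaches Gensim.
# Assumes NLTK plus its WordNet data are installed: nltk.download('wordnet').
from nltk.stem import WordNetLemmatizer

from gensim import corpora

lemmatizer = WordNetLemmatizer()
raw_docs = [
    "the cats were chasing mice",
    "a mouse ran across the floor",
]

# Whitespace tokenization plus per-token lemmatization, done up front.
texts = [[lemmatizer.lemmatize(tok) for tok in doc.split()] for doc in raw_docs]

# Gensim only ever sees the already-lemmatized tokens.
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]
```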
docs/src/apiref.rst (0 additions, 1 deletion)

@@ -89,7 +89,6 @@ Modules:
    scripts/make_wikicorpus
    scripts/word2vec_standalone
    scripts/make_wiki_online
-   scripts/make_wiki_online_lemma
    scripts/make_wiki_online_nodebug
    scripts/word2vec2tensor
    scripts/segment_wiki
docs/src/scripts/make_wiki_online_lemma.rst (0 additions, 9 deletions)

This file was deleted.
gensim/corpora/_mmreader.pyx (1 addition, 3 deletions)

@@ -7,8 +7,6 @@ from __future__ import with_statement

 from gensim import utils
 
-from six import string_types
-from six.moves import range
 import logging
 
 cimport cython
@@ -187,7 +185,7 @@ cdef class MmReader():

         if offset == -1:
             return []
-        if isinstance(self.input, string_types):
+        if isinstance(self.input, str):
             fin, close_fin = utils.open(self.input, 'rb'), True
         else:
             fin, close_fin = self.input, False
gensim/corpora/wikicorpus.py (13 additions, 17 deletions)

@@ -12,11 +12,8 @@
 Notes
 -----
-If you have the `pattern <https://github.com/clips/pattern>`_ package installed,
-this module will use a fancy lemmatization to get a lemma of each token (instead of plain alphabetic tokenizer).
-
 See :mod:`gensim.scripts.make_wiki` for a canned (example) command-line script based on this module.
 """

import bz2
@@ -467,9 +464,8 @@ def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,
     Parameters
     ----------
-    args : (str, bool, str, int)
-        Article text, lemmatize flag (if True, :func:`~gensim.utils.lemmatize` will be used), article title,
-        page identificator.
+    args : (str, str, int)
+        Article text, article title, page identificator.
     tokenizer_func : function
         Function for tokenization (defaults is :func:`~gensim.corpora.wikicorpus.tokenize`).
         Needs to have interface:
@@ -487,12 +483,9 @@
         List of tokens from article, title and page id.
 
     """
-    text, lemmatize, title, pageid = args
+    text, title, pageid = args
     text = filter_wiki(text)
-    if lemmatize:
-        result = utils.lemmatize(text)
-    else:
-        result = tokenizer_func(text, token_min_len, token_max_len, lower)
+    result = tokenizer_func(text, token_min_len, token_max_len, lower)
     return result, title, pageid


@@ -574,7 +567,7 @@ class WikiCorpus(TextCorpus):
     >>> MmCorpus.serialize(corpus_path, wiki) # another 8h, creates a file in MatrixMarket format and mapping
 
     """
-    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
+    def __init__(self, fname, processes=None, lemmatize=None, dictionary=None,
                  filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
                  token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
         """Initialize the corpus.
@@ -588,9 +581,6 @@
             Path to the Wikipedia dump file.
         processes : int, optional
             Number of processes to run, defaults to `max(1, number of cpu - 1)`.
-        lemmatize : bool
-            Use lemmatization instead of simple regexp tokenization.
-            Defaults to `True` if you have the `pattern <https://github.com/clips/pattern>`_ package installed.
         dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
             Dictionary, if not provided, this scans the corpus once, to determine its vocabulary
             **IMPORTANT: this needs a really long time**.
@@ -618,14 +608,20 @@
             Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.
 
         """
+        if lemmatize is not None:
+            raise NotImplementedError(
+                'The lemmatize parameter is no longer supported. '
+                'If you need to lemmatize, use e.g. <https://github.com/clips/pattern>. '
+                'Perform lemmatization as part of your tokenization function and '
+                'pass it as the tokenizer_func parameter to this initializer.'
+            )
         self.fname = fname
         self.filter_namespaces = filter_namespaces
         self.filter_articles = filter_articles
         self.metadata = False
         if processes is None:
             processes = max(1, multiprocessing.cpu_count() - 1)
         self.processes = processes
-        self.lemmatize = lemmatize
         self.tokenizer_func = tokenizer_func
         self.article_min_tokens = article_min_tokens
         self.token_min_len = token_min_len
Expand Down Expand Up @@ -677,7 +673,7 @@ def get_texts(self):

         tokenization_params = (self.tokenizer_func, self.token_min_len, self.token_max_len, self.lower)
         texts = (
-            (text, self.lemmatize, title, pageid, tokenization_params)
+            (text, title, pageid, tokenization_params)
             for title, text, pageid
             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles)
         )
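The new NotImplementedError above tells users to fold lemmatization into `tokenizer_func`. A hedged sketch of such a replacement tokenizer, reusing gensim's default `tokenize` plus NLTK's WordNetLemmatizer (the NLTK dependency and the `lemmatizing_tokenizer` name are illustrative assumptions, not part of this commit):

```python
# Sketch only: a tokenizer_func matching the interface documented above,
# (str, int, int, bool) -> list of str, with lemmatization bolted on.
from nltk.stem import WordNetLemmatizer

from gensim.corpora.wikicorpus import WikiCorpus, tokenize

lemmatizer = WordNetLemmatizer()

def lemmatizing_tokenizer(text, token_min_len, token_max_len, lower):
    # Run gensim's default alphabetic tokenizer first, then lemmatize each token.
    tokens = tokenize(text, token_min_len, token_max_len, lower)
    return [lemmatizer.lemmatize(token) for token in tokens]

# Defined at module level so the multiprocessing workers can pickle it.
wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2', tokenizer_func=lemmatizing_tokenizer)
```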
gensim/models/word2vec_corpusfile.pyx (0 additions, 1 deletion)

@@ -15,7 +15,6 @@ import cython
 import numpy as np
 
 from gensim.utils import any2utf8
-from six import iteritems
 
 cimport numpy as np
 
gensim/scripts/make_wiki_lemma.py (0 additions, 1 deletion)

This file was deleted.
gensim/scripts/make_wiki_online_lemma.py (0 additions, 112 deletions)

This file was deleted.
gensim/scripts/make_wikicorpus.py (2 additions, 7 deletions)

@@ -29,10 +29,6 @@
 removing tokens that appear in more than 10%% of all documents). Defaults to
 100,000.
 
-If you have the `pattern` package installed, this script will use a fancy
-lemmatization to get a lemma of each token (instead of plain alphabetic
-tokenizer). The package is available at https://github.com/clips/pattern .
-
 Example:
     python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki
 """
@@ -74,13 +70,12 @@
     else:
         keep_words = DEFAULT_DICT_SIZE
     online = 'online' in program
-    lemmatize = 'lemma' in program
     debug = 'nodebug' not in program
 
     if online:
         dictionary = HashDictionary(id_range=keep_words, debug=debug)
         dictionary.allow_update = True  # start collecting document frequencies
-        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
+        wiki = WikiCorpus(inp, dictionary=dictionary)
         # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
         MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000, metadata=True)
         # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
@@ -89,7 +84,7 @@
         wiki.save(outp + '_corpus.pkl.bz2')
         dictionary.allow_update = False
     else:
-        wiki = WikiCorpus(inp, lemmatize=lemmatize)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
+        wiki = WikiCorpus(inp)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
         # only keep the most frequent words (out of total ~8.2m unique tokens)
         wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
         # save dictionary and bag-of-words (term-document frequency matrix)
gensim/scripts/segment_wiki.py (8 additions, 5 deletions)

@@ -268,7 +268,7 @@ class _WikiSectionsCorpus(WikiCorpus):
"""

def __init__(self, fileobj, min_article_character=200, processes=None,
lemmatize=utils.has_pattern(), filter_namespaces=('0',), include_interlinks=False):
lemmatize=None, filter_namespaces=('0',), include_interlinks=False):
"""
Parameters
----------
@@ -278,22 +278,25 @@ def __init__(self, fileobj, min_article_character=200, processes=None,
             Minimal number of character for article (except titles and leading gaps).
         processes : int, optional
             Number of processes, max(1, multiprocessing.cpu_count() - 1) if None.
-        lemmatize : bool, optional
-            If `pattern` package is installed, use fancier shallow parsing to get token lemmas.
-            Otherwise, use simple regexp tokenization.
         filter_namespaces : tuple of int, optional
             Enumeration of namespaces that will be ignored.
         include_interlinks: bool
             Whether or not interlinks should be included in the output
         """
+        if lemmatize is not None:
+            raise NotImplementedError(
+                'The lemmatize parameter is no longer supported since Gensim 4.0.0. '
+                'If you need to lemmatize, use e.g. https://github.com/clips/pattern '
+                'to preprocess your corpus before submitting it to Gensim.'
+            )
+
         self.fileobj = fileobj
         self.filter_namespaces = filter_namespaces
         self.metadata = False
         if processes is None:
             processes = max(1, multiprocessing.cpu_count() - 1)
         self.processes = processes
-        self.lemmatize = lemmatize
         self.min_article_character = min_article_character
         self.include_interlinks = include_interlinks

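The error message above points users at pattern for preprocessing. A rough sketch of what that could look like, assuming the pattern package is installed and that `pattern.en.lemma` behaves as pattern documents it (the `lemmatize_line` helper is hypothetical):

```python
# Sketch only: lemmatize plain text with pattern before it reaches Gensim.
# Assumes https://github.com/clips/pattern is installed; treat pattern.en.lemma
# as an assumption about pattern's API and verify against your installed version.
from pattern.en import lemma

def lemmatize_line(line):
    # Lowercase, split on whitespace, and map every token to its lemma.
    return ' '.join(lemma(token) for token in line.lower().split())

print(lemmatize_line('The striped bats were hanging on their feet'))
```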