From e17f5e761769de7b2f6a0747eeaf75c8beed1c23 Mon Sep 17 00:00:00 2001 From: Julien Marechal Date: Sun, 30 Jul 2017 11:50:14 +0200 Subject: [PATCH 1/6] Added caching for less disk IO and flag to skip the useragent --- random_useragent.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/random_useragent.py b/random_useragent.py index 0240eaf..23f50a2 100644 --- a/random_useragent.py +++ b/random_useragent.py @@ -7,6 +7,7 @@ """ import random +from functools import lru_cache # python3 only from scrapy import signals from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware @@ -14,10 +15,16 @@ __copyright__ = "Copyright 2016, Srinivasan Rangarajan" __credits__ = ["Srinivasan Rangarajan"] __license__ = "MIT" -__version__ = "0.2" -__maintainer__ = "Srinivasan Rangarajan" -__email__ = "srinivasanr@gmail.com" -__status__ = "Development" +__version__ = "0.3" +__maintainer__ = "Julien Marechal" +__email__ = "" +__status__ = "Release" + + +@lru_cache +def file_get_user_agent_list(user_agent_list_file): + with open(user_agent_list_file, 'r') as f: + return [line.strip() for line in f.readlines()] class RandomUserAgentMiddleware(UserAgentMiddleware): @@ -33,8 +40,9 @@ def __init__(self, settings, user_agent='Scrapy'): ua = settings.get('USER_AGENT', user_agent) self.user_agent_list = [ua] else: - with open(user_agent_list_file, 'r') as f: - self.user_agent_list = [line.strip() for line in f.readlines()] + self.user_agent_list = file_get_user_agent_list( + user_agent_list_file + ) @classmethod def from_crawler(cls, crawler): @@ -44,6 +52,9 @@ def from_crawler(cls, crawler): return obj def process_request(self, request, spider): + if request.meta.get('skip_useragent'): + return request + user_agent = random.choice(self.user_agent_list) if user_agent: request.headers.setdefault('User-Agent', user_agent) From eb2afa1cad51bb7bd2fef96bebcea3d7ce97119b Mon Sep 17 00:00:00 2001 From: Julien Marechal Date: Sun, 30 Jul 2017 11:54:50 +0200 Subject: [PATCH 2/6] fixed setup --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b6c841d..f5f6201 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,10 @@ def get_package_meta(meta_name): named in the Python meta format `____`. """ regex = "__{0}__ = ['\"]([^'\"]+)['\"]".format(meta_name) - return re.search(regex, package_file).group(1) + res = re.search(regex, package_file) + if res: + return res.group(1) + return "" version = get_package_meta('version') From 8ebe43eb27257f5cf9a82b123fc107e0ccf5441f Mon Sep 17 00:00:00 2001 From: Julien Marechal Date: Sun, 30 Jul 2017 12:24:52 +0200 Subject: [PATCH 3/6] Added doc + python2 fallback --- README.rst | 6 ++++++ random_useragent.py | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index ee5200c..0834723 100644 --- a/README.rst +++ b/README.rst @@ -37,6 +37,12 @@ Then, create a new variable ``USER_AGENT_LIST`` with the path to your text file which has the list of all user-agents (one user-agent per line). +If you wish to disable the random user agent middleware on a request basis, you can us a meta flag. + +.. code-block:: python + + scrapy.Request('https://...', callback=function, meta={'skip_useragent': True}) + .. code-block:: python USER_AGENT_LIST = "/path/to/useragents.txt" diff --git a/random_useragent.py b/random_useragent.py index 23f50a2..4a64ad6 100644 --- a/random_useragent.py +++ b/random_useragent.py @@ -7,7 +7,12 @@ """ import random -from functools import lru_cache # python3 only +try: + from functools import lru_cache # python3 only +except ImportError: + lru_cache = lambda maxsize: lambda f: f # noqa + + from scrapy import signals from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware From d44ff181baa552cded51172869eced194be5cec0 Mon Sep 17 00:00:00 2001 From: Julien Marechal Date: Sun, 30 Jul 2017 12:25:50 +0200 Subject: [PATCH 4/6] re-order docs --- README.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 0834723..96fc56d 100644 --- a/README.rst +++ b/README.rst @@ -37,15 +37,16 @@ Then, create a new variable ``USER_AGENT_LIST`` with the path to your text file which has the list of all user-agents (one user-agent per line). -If you wish to disable the random user agent middleware on a request basis, you can us a meta flag. - -.. code-block:: python - - scrapy.Request('https://...', callback=function, meta={'skip_useragent': True}) - .. code-block:: python USER_AGENT_LIST = "/path/to/useragents.txt" Now all the requests from your crawler will have a random user-agent picked from the text file. + + +If you wish to disable the random user agent middleware on a request basis, you can us a meta flag. + +.. code-block:: python + + scrapy.Request('https://...', callback=function, meta={'skip_useragent': True}) From d63aeb78e067e7147bf6653fe99b8ec7daad650b Mon Sep 17 00:00:00 2001 From: Julien Marechal Date: Sun, 30 Jul 2017 12:26:34 +0200 Subject: [PATCH 5/6] typo --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 96fc56d..562b393 100644 --- a/README.rst +++ b/README.rst @@ -45,7 +45,7 @@ Now all the requests from your crawler will have a random user-agent picked from the text file. -If you wish to disable the random user agent middleware on a request basis, you can us a meta flag. +If you wish to disable the random user agent middleware on a request basis, you can use a meta flag. .. code-block:: python From 5130e9facda57124124f6f6f8b07395d810aa5c6 Mon Sep 17 00:00:00 2001 From: Julien Marechal Date: Sun, 30 Jul 2017 14:14:32 +0200 Subject: [PATCH 6/6] return None to continue --- random_useragent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/random_useragent.py b/random_useragent.py index 4a64ad6..2d472b5 100644 --- a/random_useragent.py +++ b/random_useragent.py @@ -58,7 +58,7 @@ def from_crawler(cls, crawler): def process_request(self, request, spider): if request.meta.get('skip_useragent'): - return request + return user_agent = random.choice(self.user_agent_list) if user_agent: