diff --git a/README.rst b/README.rst
index ee5200c..562b393 100644
--- a/README.rst
+++ b/README.rst
@@ -43,3 +43,10 @@ text file which has the list of all user-agents
 
 Now all the requests from your crawler will have a random user-agent
 picked from the text file.
+
+
+If you wish to disable the random user agent middleware on a per-request basis, set the ``skip_useragent`` meta flag on that request.
+
+.. code-block:: python
+
+    scrapy.Request('https://...', callback=function, meta={'skip_useragent': True})
diff --git a/random_useragent.py b/random_useragent.py
index 0240eaf..2d472b5 100644
--- a/random_useragent.py
+++ b/random_useragent.py
@@ -7,6 +7,12 @@
 """
 
 import random
+try:
+    from functools import lru_cache  # python3 only
+except ImportError:
+    lru_cache = lambda maxsize: lambda f: f  # noqa
+
+
 from scrapy import signals
 from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
 
@@ -14,10 +20,24 @@
 __copyright__ = "Copyright 2016, Srinivasan Rangarajan"
 __credits__ = ["Srinivasan Rangarajan"]
 __license__ = "MIT"
-__version__ = "0.2"
-__maintainer__ = "Srinivasan Rangarajan"
-__email__ = "srinivasanr@gmail.com"
-__status__ = "Development"
+__version__ = "0.3"
+__maintainer__ = "Julien Marechal"
+__email__ = ""
+__status__ = "Release"
+
+
+# NOTE: call the decorator factory explicitly. A bare ``@lru_cache`` only
+# works on Python 3.8+, and with the Python 2 fallback above it would
+# replace this function with the identity decorator.
+@lru_cache(maxsize=128)
+def file_get_user_agent_list(user_agent_list_file):
+    """Return the whitespace-stripped lines of a user-agent list file.
+
+    Where ``functools.lru_cache`` is available the result is cached per
+    path, so repeated calls with the same path read the file only once.
+    """
+    with open(user_agent_list_file, 'r') as f:
+        return [line.strip() for line in f]
 
 
 class RandomUserAgentMiddleware(UserAgentMiddleware):
@@ -33,8 +53,9 @@ def __init__(self, settings, user_agent='Scrapy'):
             ua = settings.get('USER_AGENT', user_agent)
             self.user_agent_list = [ua]
         else:
-            with open(user_agent_list_file, 'r') as f:
-                self.user_agent_list = [line.strip() for line in f.readlines()]
+            self.user_agent_list = file_get_user_agent_list(
+                user_agent_list_file
+            )
 
     @classmethod
     def from_crawler(cls, crawler):
@@ -44,6 +65,10 @@ def from_crawler(cls, crawler):
         return obj
 
     def process_request(self, request, spider):
+        # Per-request opt-out: leave this request's User-Agent untouched.
+        if request.meta.get('skip_useragent'):
+            return
+
         user_agent = random.choice(self.user_agent_list)
         if user_agent:
             request.headers.setdefault('User-Agent', user_agent)
diff --git a/setup.py b/setup.py
index b6c841d..f5f6201 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,11 @@ def get_package_meta(meta_name):
     named in the Python meta format `____`.
     """
     regex = "__{0}__ = ['\"]([^'\"]+)['\"]".format(meta_name)
-    return re.search(regex, package_file).group(1)
+    # The pattern needs a non-empty value; fall back to "" when it is absent.
+    match = re.search(regex, package_file)
+    if match:
+        return match.group(1)
+    return ""
 
 
 version = get_package_meta('version')