diff --git a/README.rst b/README.rst index ee5200c..a6ab08b 100644 --- a/README.rst +++ b/README.rst @@ -40,6 +40,10 @@ text file which has the list of all user-agents .. code-block:: python USER_AGENT_LIST = "/path/to/useragents.txt" + # User Agent mode + # 0 = Each request has a different user agent (default) + # 1 = Take only one user agent from the list and assign it to every request + USER_AGENT_MODE = 0 Now all the requests from your crawler will have a random user-agent picked from the text file. diff --git a/random_useragent.py b/random_useragent.py index 0240eaf..b469262 100644 --- a/random_useragent.py +++ b/random_useragent.py @@ -7,18 +7,20 @@ """ import random +import logging from scrapy import signals from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware __author__ = "Srinivasan Rangarajan" __copyright__ = "Copyright 2016, Srinivasan Rangarajan" -__credits__ = ["Srinivasan Rangarajan"] +__credits__ = ["Srinivasan Rangarajan","Alessio Pollero"] __license__ = "MIT" -__version__ = "0.2" +__version__ = "0.3" __maintainer__ = "Srinivasan Rangarajan" __email__ = "srinivasanr@gmail.com" __status__ = "Development" +log = logging.getLogger('scrapy.useragents') class RandomUserAgentMiddleware(UserAgentMiddleware): @@ -35,6 +37,8 @@ def __init__(self, settings, user_agent='Scrapy'): else: with open(user_agent_list_file, 'r') as f: self.user_agent_list = [line.strip() for line in f.readlines()] + if(settings.get('USER_AGENT_MODE', user_agent) == 1): + self.user_agent_list = [random.choice(self.user_agent_list)] @classmethod def from_crawler(cls, crawler): @@ -46,4 +50,8 @@ def from_crawler(cls, crawler): def process_request(self, request, spider): user_agent = random.choice(self.user_agent_list) if user_agent: + log.debug('Using user agent: ' + user_agent) request.headers.setdefault('User-Agent', user_agent) + if ('splash' in request.meta): + request.meta['splash']['args']['ua'] = user_agent +