From b0b56da529da77e8eaabe95139ae6b502cfa7927 Mon Sep 17 00:00:00 2001
From: Alessio Pollero
Date: Tue, 24 Oct 2017 12:19:17 +0200
Subject: [PATCH 1/2] Add same USER AGENT for all requests

---
 README.rst          | 4 ++++
 random_useragent.py | 9 +++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index ee5200c..a6ab08b 100644
--- a/README.rst
+++ b/README.rst
@@ -40,6 +40,10 @@ text file which has the list of all user-agents
 .. code-block:: python
 
     USER_AGENT_LIST = "/path/to/useragents.txt"
+    # User agent mode
+    # 0 = Each request has a different user agent (default)
+    # 1 = Take only one user agent from the list and assign it to every request
+    USER_AGENT_MODE = 0
 
 Now all the requests from your crawler will have a random user-agent
 picked from the text file.
diff --git a/random_useragent.py b/random_useragent.py
index 0240eaf..466b213 100644
--- a/random_useragent.py
+++ b/random_useragent.py
@@ -7,18 +7,20 @@
 """
 
 import random
+import logging
 
 from scrapy import signals
 from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
 
 __author__ = "Srinivasan Rangarajan"
 __copyright__ = "Copyright 2016, Srinivasan Rangarajan"
-__credits__ = ["Srinivasan Rangarajan"]
+__credits__ = ["Srinivasan Rangarajan", "Alessio Pollero"]
 __license__ = "MIT"
-__version__ = "0.2"
+__version__ = "0.3"
 __maintainer__ = "Srinivasan Rangarajan"
 __email__ = "srinivasanr@gmail.com"
 __status__ = "Development"
 
+log = logging.getLogger('scrapy.useragents')
 
 class RandomUserAgentMiddleware(UserAgentMiddleware):
 
@@ -35,6 +37,8 @@ def __init__(self, settings, user_agent='Scrapy'):
         else:
             with open(user_agent_list_file, 'r') as f:
                 self.user_agent_list = [line.strip() for line in f.readlines()]
+        if settings.get('USER_AGENT_MODE', 0) == 1:
+            self.user_agent_list = [random.choice(self.user_agent_list)]
 
     @classmethod
     def from_crawler(cls, crawler):
@@ -46,4 +50,5 @@ def from_crawler(cls, crawler):
     def process_request(self, request, spider):
         user_agent = random.choice(self.user_agent_list)
         if user_agent:
+            log.debug('Using user agent: ' + user_agent)
             request.headers.setdefault('User-Agent', user_agent)

From 9477777d8e54eeae462e7d947c950b2a3a89eb87 Mon Sep 17 00:00:00 2001
From: Alessio Pollero
Date: Wed, 28 Mar 2018 16:05:53 +0200
Subject: [PATCH 2/2] Set splash user agent in the case the request is forwarded to Splash

---
 random_useragent.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/random_useragent.py b/random_useragent.py
index 466b213..b469262 100644
--- a/random_useragent.py
+++ b/random_useragent.py
@@ -52,3 +52,6 @@ def process_request(self, request, spider):
         if user_agent:
             log.debug('Using user agent: ' + user_agent)
             request.headers.setdefault('User-Agent', user_agent)
+        if 'splash' in request.meta:
+            request.meta['splash']['args']['ua'] = user_agent
+
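
For reference, a minimal sketch of the ``settings.py`` configuration these changes assume. The middleware class path is taken from the module and class names in this patch, and the built-in ``UserAgentMiddleware`` path from its import; the priority value, file path, and chosen mode are illustrative assumptions, not part of the patch.

.. code-block:: python

    # settings.py (illustrative sketch; priority, path, and mode are assumptions)

    DOWNLOADER_MIDDLEWARES = {
        # Disable Scrapy's built-in user-agent middleware.
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        # Enable the random user-agent middleware from this project.
        'random_useragent.RandomUserAgentMiddleware': 400,
    }

    # Text file with one user-agent string per line.
    USER_AGENT_LIST = "/path/to/useragents.txt"

    # 0 = different user agent for each request (default)
    # 1 = pick one user agent at startup and reuse it for every request
    USER_AGENT_MODE = 1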