From 0db7ce8de3284f8be460796dd0d111a027b04704 Mon Sep 17 00:00:00 2001
From: Ed Summers
Date: Thu, 11 Dec 2014 12:55:40 -0500
Subject: [PATCH] fixes #35

---
 README.md | 16 +++----------
 test.py   |  9 --------
 twarc.py  | 67 ++-----------------------------------------------------
 3 files changed, 5 insertions(+), 87 deletions(-)

diff --git a/README.md b/README.md
index 40f63828..4baf641d 100644
--- a/README.md
+++ b/README.md
@@ -39,16 +39,6 @@ to share Twitter IDs instead. You can use twarc to "hydrate" them:
 
     twarc.py --hydrate ids.txt > tweets.json
 
-### Scrape Mode
-
-The first time you fetch tweets for a query if you pass the --scrape option
-it will use [search.twitter.com](http://search.twitter.com) to discover tweet
-IDs, and then use the Twitter REST API to fetch the JSON for each tweet. This
-is an expensive operation because each ID needs to be fetched from the API
-which counts as a request against your quota.
-
-[Twitter Search](http://search.twitter.com) [now supports](http://blog.twitter.com/2013/02/now-showing-older-tweets-in-search.html) drilling backwards in time, past the week cutoff of the REST API. Since individual tweets are still retrieved with the REST API, rate limits apply--so this is quite a slow process. Still, if you are willing to let it run for a while it can be useful to query for older tweets, until the official search REST API supports a more historical perspective.
-
 ### Use as a Library
 
 If you want you can use twarc to get a stream of tweets from a search as JSON
@@ -106,17 +96,17 @@ Or if you want to output [GeoJSON](http://geojson.org/) from tweets where geo co
 
 Or if you have duplicate tweets in your JSON, deduplicate using:
 
-    % ./twarc.py --scrape --query nasa
+    % ./twarc.py --query nasa
     % utils/deduplicate.py nasa-20130306102105.json > deduped.json
 
 Or if you want to sort by ID, which is analogous to sorting by time:
 
-    % ./twarc.py --scrape --query nasa
+    % ./twarc.py --query nasa
     % utils/sort_by_id.py nasa-20130306102105.json > sorted.json
 
 Or if you want to filter out all tweets before a certain date (for example, if a hashtag was used for another event before the one you're interested in):
 
-    % ./twarc.py --scrape --query "#somehashtag"
+    % ./twarc.py --query "#somehashtag"
     % utils/filter_date.py --mindate 1-may-2014 %23somehashtag-20141020122149.json > filtered.json
 
 Or if you want an HTML list of the clients used:
diff --git a/test.py b/test.py
index 0e1d789d..d207c78c 100644
--- a/test.py
+++ b/test.py
@@ -81,15 +81,6 @@ def test_paging():
     assert count == 500
 
-def test_scape():
-    # TODO: should try to get test w/ max_id working
-    count = 0
-    for tweet in twarc.scrape_tweets("twttr"):
-        count += 1
-        if count == 10:
-            break
-    assert count == 10
-
 
 def test_hydrate():
     ids = [
         "501064188211765249", "501064196642340864", "501064197632167936",
diff --git a/twarc.py b/twarc.py
index da6374a5..69b483f6 100755
--- a/twarc.py
+++ b/twarc.py
@@ -140,10 +140,8 @@ def ping(self, times=10):
                      self.remaining, self.reset)
 
 
-def search(q, since_id=None, max_id=None, scrape=True, only_ids=False):
-    """returns a generator for *all* search results. If you supply scrape,
-    twarc will attemp to dig back further in time by scraping
-    search.twitter.com and looking up individual tweets.
+def search(q, since_id=None, max_id=None, only_ids=False):
+    """returns a generator for *all* search results.
     """
     logging.info("starting search for %s with since_id=%s and max_id=%s" %
                  (q, since_id, max_id))
@@ -154,10 +152,6 @@ def search(q, since_id=None, max_id=None, only_ids=False):
     for status in results:
         yield status
 
-    if scrape and not since_id:
-        for status in scrape_tweets(q, max_id=max_id):
-            yield status
-
 
 def stream(q):
     """Will return a generator for tweets that match a given query from
@@ -243,58 +237,6 @@ def archive(q, statuses):
         fh.write("\n")
 
 
-def scrape_tweets(query, max_id=None):
-    """
-    A kinda sneaky and slow way to retrieve older tweets, now that search on
-    the Twitter website extends back in time, even if the API does not.
-    """
-    for tweet in hydrate(scrape_tweet_ids(query, max_id)):
-        yield tweet
-
-
-def scrape_tweet_ids(query, max_id):
-    cursor = None
-    url = 'https://twitter.com/i/search/timeline?'
-    q = {
-        "q": query,
-        'f': 'realtime',
-        "src": "typd",
-        "include_available_features": 1,
-        "include_entities": 1,
-        "oldest_unread_id": max_id
-    }
-
-    while True:
-        logging.info("scraping tweets with id < %s", max_id)
-        q["last_note_ts"] = calendar.timegm(time.gmtime())
-        if cursor:
-            q["oldest_unread_id"] = 0
-            q["scroll_cursor"] = cursor
-
-        logging.debug("scraping %s", url + "?" + urlencode(q))
-        r = requests.get(url, headers={"user-agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"}, params=q)
-        s = json.loads(r.content)
-
-        html = s["items_html"]
-        tweet_ids = re.findall(r'