From 0db7ce8de3284f8be460796dd0d111a027b04704 Mon Sep 17 00:00:00 2001
From: Ed Summers
Date: Thu, 11 Dec 2014 12:55:40 -0500
Subject: [PATCH] fixes #35

---
 README.md | 16 +++----------
 test.py   |  9 --------
 twarc.py  | 67 ++-----------------------------------------------------
 3 files changed, 5 insertions(+), 87 deletions(-)

diff --git a/README.md b/README.md
index 40f63828..4baf641d 100644
--- a/README.md
+++ b/README.md
@@ -39,16 +39,6 @@ to share Twitter IDs instead. You can use twarc to "hydrate" them:
 
     twarc.py --hydrate ids.txt > tweets.json
 
-### Scrape Mode
-
-The first time you fetch tweets for a query if you pass the --scrape option
-it will use [search.twitter.com](http://search.twitter.com) to discover tweet
-IDs, and then use the Twitter REST API to fetch the JSON for each tweet. This
-is an expensive operation because each ID needs to be fetched from the API
-which counts as a request against your quota.
-
-[Twitter Search](http://search.twitter.com) [now supports](http://blog.twitter.com/2013/02/now-showing-older-tweets-in-search.html) drilling backwards in time, past the week cutoff of the REST API. Since individual tweets are still retrieved with the REST API, rate limits apply--so this is quite a slow process. Still, if you are willing to let it run for a while it can be useful to query for older tweets, until the official search REST API supports a more historical perspective.
-
 ### Use as a Library
 
 If you want you can use twarc to get a stream of tweets from a search as JSON
@@ -106,17 +96,17 @@ Or if you want to output [GeoJSON](http://geojson.org/) from tweets where geo co
 
 Or if you have duplicate tweets in your JSON, deduplicate using:
 
-    % ./twarc.py --scrape --query nasa
+    % ./twarc.py --query nasa
     % utils/deduplicate.py nasa-20130306102105.json > deduped.json
 
 Or if you want to sort by ID, which is analogous to sorting by time:
 
-    % ./twarc.py --scrape --query nasa
+    % ./twarc.py --query nasa
     % utils/sort_by_id.py nasa-20130306102105.json > sorted.json
 
 Or if you want to filter out all tweets before a certain date (for example, if a hashtag was used for another event before the one you're interested in):
 
-    % ./twarc.py --scrape --query "#somehashtag"
+    % ./twarc.py --query "#somehashtag"
     % utils/filter_date.py --mindate 1-may-2014 %23somehashtag-20141020122149.json > filtered.json
 
 Or if you want an HTML list of the clients used:
diff --git a/test.py b/test.py
index 0e1d789d..d207c78c 100644
--- a/test.py
+++ b/test.py
@@ -81,15 +81,6 @@ def test_paging():
     assert count == 500
 
-def test_scape():
-    # TODO: should try to get test w/ max_id working
-    count = 0
-    for tweet in twarc.scrape_tweets("twttr"):
-        count += 1
-        if count == 10:
-            break
-    assert count == 10
-
 
 def test_hydrate():
     ids = [
         "501064188211765249", "501064196642340864", "501064197632167936",
diff --git a/twarc.py b/twarc.py
index da6374a5..69b483f6 100755
--- a/twarc.py
+++ b/twarc.py
@@ -140,10 +140,8 @@ def ping(self, times=10):
                      self.remaining, self.reset)
 
 
-def search(q, since_id=None, max_id=None, scrape=True, only_ids=False):
-    """returns a generator for *all* search results. If you supply scrape,
-    twarc will attemp to dig back further in time by scraping
-    search.twitter.com and looking up individual tweets.
+def search(q, since_id=None, max_id=None, only_ids=False):
+    """returns a generator for *all* search results.
     """
     logging.info("starting search for %s with since_id=%s and max_id=%s" %
                  (q, since_id, max_id))
@@ -154,10 +152,6 @@ def search(q, since_id=None, max_id=None, only_ids=False):
     for status in results:
         yield status
 
-    if scrape and not since_id:
-        for status in scrape_tweets(q, max_id=max_id):
-            yield status
-
 
 def stream(q):
     """Will return a generator for tweets that match a given query from
@@ -243,58 +237,6 @@ def archive(q, statuses):
         fh.write("\n")
 
 
-def scrape_tweets(query, max_id=None):
-    """
-    A kinda sneaky and slow way to retrieve older tweets, now that search on
-    the Twitter website extends back in time, even if the API does not.
-    """
-    for tweet in hydrate(scrape_tweet_ids(query, max_id)):
-        yield tweet
-
-
-def scrape_tweet_ids(query, max_id):
-    cursor = None
-    url = 'https://twitter.com/i/search/timeline?'
-    q = {
-        "q": query,
-        'f': 'realtime',
-        "src": "typd",
-        "include_available_features": 1,
-        "include_entities": 1,
-        "oldest_unread_id": max_id
-    }
-
-    while True:
-        logging.info("scraping tweets with id < %s", max_id)
-        q["last_note_ts"] = calendar.timegm(time.gmtime())
-        if cursor:
-            q["oldest_unread_id"] = 0
-            q["scroll_cursor"] = cursor
-
-        logging.debug("scraping %s", url + "?" + urlencode(q))
-        r = requests.get(url, headers={"user-agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"}, params=q)
-        s = json.loads(r.content)
-
-        html = s["items_html"]
-        tweet_ids = re.findall(r'