
Commit

fixes #35
edsu committed Dec 11, 2014
1 parent 5a94036 commit 0db7ce8
Showing 3 changed files with 5 additions and 87 deletions.
16 changes: 3 additions & 13 deletions README.md
@@ -39,16 +39,6 @@ to share Twitter IDs instead. You can use twarc to "hydrate" them:

twarc.py --hydrate ids.txt > tweets.json

-### Scrape Mode
-
-The first time you fetch tweets for a query if you pass the --scrape option
-it will use [search.twitter.com](http://search.twitter.com) to discover tweet
-IDs, and then use the Twitter REST API to fetch the JSON for each tweet. This
-is an expensive operation because each ID needs to be fetched from the API
-which counts as a request against your quota.
-
-[Twitter Search](http://search.twitter.com) [now supports](http://blog.twitter.com/2013/02/now-showing-older-tweets-in-search.html) drilling backwards in time, past the week cutoff of the REST API. Since individual tweets are still retrieved with the REST API, rate limits apply--so this is quite a slow process. Still, if you are willing to let it run for a while it can be useful to query for older tweets, until the official search REST API supports a more historical perspective.
-
### Use as a Library

If you want you can use twarc to get a stream of tweets from a search as JSON
@@ -106,17 +96,17 @@ Or if you want to output [GeoJSON](http://geojson.org/) from tweets where geo co

Or if you have duplicate tweets in your JSON, deduplicate using:

-% ./twarc.py --scrape --query nasa
+% ./twarc.py --query nasa
% utils/deduplicate.py nasa-20130306102105.json > deduped.json

Or if you want to sort by ID, which is analogous to sorting by time:

-% ./twarc.py --scrape --query nasa
+% ./twarc.py --query nasa
% utils/sort_by_id.py nasa-20130306102105.json > sorted.json

Or if you want to filter out all tweets before a certain date (for example, if a hashtag was used for another event before the one you're interested in):

-% ./twarc.py --scrape --query "#somehashtag"
+% ./twarc.py --query "#somehashtag"
% utils/filter_date.py --mindate 1-may-2014 %23somehashtag-20141020122149.json > filtered.json

Or if you want an HTML list of the clients used:
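The README's ID "hydration" workflow and its "Use as a Library" section are untouched by this commit. As a point of reference, a minimal sketch of doing that hydration from Python rather than the shell, assuming twarc.py is importable as `twarc`, API credentials are already configured as the README describes, and that `ids.txt` holds one tweet ID per line (the filenames are illustrative, not part of the commit):

    import json
    import twarc

    # Read tweet IDs (one per line) and fetch the full tweet JSON from the
    # REST API, mirroring `twarc.py --hydrate ids.txt > tweets.json`.
    with open("ids.txt") as ids, open("tweets.json", "w") as out:
        for tweet in twarc.hydrate(line.strip() for line in ids):
            out.write(json.dumps(tweet))
            out.write("\n")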
9 changes: 0 additions & 9 deletions test.py
@@ -81,15 +81,6 @@ def test_paging():
    assert count == 500


-def test_scape():
-    # TODO: should try to get test w/ max_id working
-    count = 0
-    for tweet in twarc.scrape_tweets("twttr"):
-        count += 1
-        if count == 10:
-            break
-    assert count == 10
-
def test_hydrate():
    ids = [
        "501064188211765249", "501064196642340864", "501064197632167936",
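With test_scape gone, the search path is still exercised elsewhere in test.py; a comparable smoke test against the REST-only search generator might look like the sketch below. The function name and query are illustrative, and this test is not part of the commit:

    import twarc

    def test_search_smoke():
        # Illustrative only: pull a handful of results from the REST search
        # generator and stop early, in the spirit of the removed test_scape.
        count = 0
        for tweet in twarc.search("twttr"):
            count += 1
            if count == 10:
                break
        assert count == 10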
67 changes: 2 additions & 65 deletions twarc.py
@@ -140,10 +140,8 @@ def ping(self, times=10):
self.remaining, self.reset)


-def search(q, since_id=None, max_id=None, scrape=True, only_ids=False):
-    """returns a generator for *all* search results. If you supply scrape,
-    twarc will attemp to dig back further in time by scraping
-    search.twitter.com and looking up individual tweets.
+def search(q, since_id=None, max_id=None, only_ids=False):
+    """returns a generator for *all* search results.
    """
    logging.info("starting search for %s with since_id=%s and max_id=%s" %
                 (q, since_id, max_id))
@@ -154,10 +152,6 @@ def search(q, since_id=None, max_id=None, scrape=True, only_ids=False):
for status in results:
yield status

-    if scrape and not since_id:
-        for status in scrape_tweets(q, max_id=max_id):
-            yield status


def stream(q):
"""Will return a generator for tweets that match a given query from
@@ -243,10 +237,6 @@ def archive(q, statuses):
fh.write("\n")


-def scrape_tweets(query, max_id=None):
-    """
-    A kinda sneaky and slow way to retrieve older tweets, now that search on
-    the Twitter website extends back in time, even if the API does not.
-    """
-    for tweet in hydrate(scrape_tweet_ids(query, max_id)):
-        yield tweet
-
-
-def scrape_tweet_ids(query, max_id):
-    cursor = None
-    url = 'https://twitter.com/i/search/timeline?'
-    q = {
-        "q": query,
-        'f': 'realtime',
-        "src": "typd",
-        "include_available_features": 1,
-        "include_entities": 1,
-        "oldest_unread_id": max_id
-    }
-
-    while True:
-        logging.info("scraping tweets with id < %s", max_id)
-        q["last_note_ts"] = calendar.timegm(time.gmtime())
-        if cursor:
-            q["oldest_unread_id"] = 0
-            q["scroll_cursor"] = cursor
-
-        logging.debug("scraping %s", url + "?" + urlencode(q))
-        r = requests.get(url, headers={"user-agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"}, params=q)
-        s = json.loads(r.content)
-
-        html = s["items_html"]
-        tweet_ids = re.findall(r'<a href=\"/.+/status/(\d+)', html)
-        logging.info("scraped tweet ids: %s", tweet_ids)
-
-        if len(tweet_ids) == 0:
-            logging.debug("no more tweet ids: %s", html)
-            raise StopIteration
-
-        for tweet_id in tweet_ids:
-            max_id = tweet_id
-            yield tweet_id
-
-        # seems to fetch more tweets when we sleep a random amount of time?
-        seconds = random.randint(0, 3)
-        logging.debug("sleeping for %s" % seconds)
-        time.sleep(seconds)
-
-        cursor = s['scroll_cursor']


def hydrate(tweet_ids):
"""
Give hydrate a list or generator of Twitter IDs and you get back
@@ -328,10 +270,6 @@ def hydrate(tweet_ids):
parser = argparse.ArgumentParser("twarc")
parser.add_argument("--query", dest="query", action="store",
                    help="query to use to filter Twitter results")
-parser.add_argument("--scrape", dest="scrape", action="store_true",
-                    help='attempt to scrape tweets from '
-                         'search.twitter.com for tweets not available via'
-                         'Twitter\'s search REST API')
parser.add_argument("--max_id", dest="max_id", action="store",
                    help="maximum tweet id to fetch")
parser.add_argument("--since_id", dest="since_id", action="store",
@@ -361,7 +299,6 @@ def hydrate(tweet_ids):
args.query,
since_id=since_id,
max_id=args.max_id,
-scrape=args.scrape
)

if args.query:
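After this change the command-line path above builds the generator without the scrape keyword and reaches the REST search API only. A minimal sketch of the equivalent library usage, assuming twarc.py is importable as `twarc` and credentials are configured as before; the output filename here is illustrative, since the command line itself writes a timestamped file such as nasa-20130306102105.json:

    import json
    import twarc

    # Roughly what `./twarc.py --query nasa` does after this commit: walk the
    # REST search API, optionally bounded by since_id / max_id, and write one
    # JSON document per line.
    with open("nasa.json", "w") as out:
        for tweet in twarc.search("nasa", since_id=None, max_id=None):
            out.write(json.dumps(tweet))
            out.write("\n")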
