-
Notifications
You must be signed in to change notification settings - Fork 8
/
crawler_api.py
40 lines (30 loc) · 1.36 KB
/
crawler_api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import praw
import logging
import argparse
from distutils.dir_util import mkpath
from praw.helpers import submission_stream
from crawler_utils import save_submission
def get_as_much_stuff_as_possible(storage_dir):
    """Do a one-shot bulk crawl of /r/learnprogramming.

    Walks every listing endpoint (hot, new, and each top-* time window),
    pulls up to 1000 submissions from each, and persists every submission
    to ``storage_dir`` via ``save_submission``.

    :param storage_dir: directory to save submissions into; created if missing.
    """
    # FIX: 0755 is the legacy octal literal (a syntax error on Python 3);
    # 0o755 is equivalent and valid on Python 2.6+ as well.
    mkpath(storage_dir, mode=0o755)
    r = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme')
    # Each listing caps out at ~1000 items through the Reddit API, so hitting
    # all of them maximizes the distinct submissions we can collect.
    for method_name in ["get_hot", "get_new", "get_top_from_all", "get_top_from_week",
                        "get_top_from_month", "get_top_from_year", "get_top_from_day",
                        "get_top_from_hour"]:
        method = getattr(r.get_subreddit('learnprogramming'), method_name)
        submissions = method(limit=1000)
        for s in submissions:
            save_submission(s, storage_dir)
def crawl_continuously(storage_dir):
    """Stream newly-posted /r/learnprogramming submissions indefinitely.

    Blocks forever: each incoming submission has its full comment tree
    expanded and is then persisted to ``storage_dir``.

    :param storage_dir: directory to save submissions into.
    """
    reddit = praw.Reddit(user_agent='SearchingReddit project 0.2 by /u/godlikesme')
    for submission in submission_stream(reddit, "learnprogramming"):
        # Expand all "load more comments" stubs so the saved copy is complete.
        submission.replace_more_comments(limit=None)
        save_submission(submission, storage_dir)
def main():
    """CLI entry point: bulk-crawl once, then keep streaming new posts."""
    logging.getLogger().setLevel(logging.DEBUG)

    arg_parser = argparse.ArgumentParser(
        description='Crawl /r/learnprogramming using api')
    arg_parser.add_argument("--storage_dir", dest="storage_dir", required=True)
    options = arg_parser.parse_args()

    # One-shot historical grab first, then the never-ending live stream.
    get_as_much_stuff_as_possible(options.storage_dir)
    crawl_continuously(options.storage_dir)
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()