From 7676d903a3b5205c3f583279243b2266d6dd5d38 Mon Sep 17 00:00:00 2001 From: GeorgBerner Date: Thu, 3 Mar 2016 14:12:44 +0100 Subject: [PATCH 1/3] Added python scripts for google group download and preparation --- codeface/google_groups_py/crawler.py | 271 +++++++++++++++++++++ codeface/google_groups_py/post_download.py | 111 +++++++++ 2 files changed, 382 insertions(+) create mode 100755 codeface/google_groups_py/crawler.py create mode 100755 codeface/google_groups_py/post_download.py diff --git a/codeface/google_groups_py/crawler.py b/codeface/google_groups_py/crawler.py new file mode 100755 index 00000000..5471a152 --- /dev/null +++ b/codeface/google_groups_py/crawler.py @@ -0,0 +1,271 @@ +import re +import os.path +import urllib2 +import time +import sys +import concurrent.futures + + +class Crawler(object): + + def __init__(self): + self.group = "" + self.output = "" + self.link_topic = "https://groups.google.com/forum/?_escaped_fragment_=topic/" + self.link_msg = "https://groups.google.com/forum/message/raw?msg=" + self.link_forum = "https://groups.google.com/forum/?_escaped_fragment_=forum/" + self.opener = urllib2.build_opener() + self.opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox')] + self.current_last_post_date = "00.00.00" + self.max_worker_threads = 1 + self.group_is_set = False + self.output_is_set = False + self.start_date_is_set = False + print "----------------------------------------------------" + print "------- Google Group Crawler -------\n" + + # end execution if arguments aren't complete or help is displayed + def handle_arguments(self, argv): + if len(argv) is 1: + self.print_help() + sys.exit(0) + for i in xrange(len(argv)): + if argv[i] == "-h" or argv[i] == "--h" or argv[i] == "-help": + self.print_help() + sys.exit(0) + if argv[i] == "-output": + self.set_output(argv[i+1]) + self.output_is_set = True + if argv[i] == "-group": + self.set_group(argv[i+1]) + self.group_is_set = True + if argv[i] == "-threads": + self.set_max_worker_threads(argv[i+1]) + if argv[i] == "-start": + self.set_current_last_post_date(argv[i+1]) + if self.check_date_format(self.current_last_post_date) is False: + print"Error: date wrong format" + self.print_help() + sys.exit(0) + self.start_date_is_set = True + if self.group_is_set is False: + print "Error: missing group" + self.print_help() + sys.exit(0) + if self.output_is_set is False: + print "Error: missing output" + self.print_help() + sys.exit(0) + + @staticmethod + def print_help(): + print "################################################################################" + print "Help output:" + print "################################################################################" + print "crawler.py -output \"path\" -group \"groupname\" [-threads \"#threads\",\n -start \"DD.MM.YY\"]" + print "################################################################################" + print "-output: output path for ggc" + print "-group: name of the google group" + print "-threads: number of threads for simultaneous download (default = 1) (only for dl)" + print "-start: date from which all messages of a group will be downloaded, \"DD.MM.YY\" (only for dl)" + print "" + print "If group has been downloaded before only new messages will be downloaded" + print "To download the entire group again delete the threads.txt file or the complete directory" + print "################################################################################" + + # checks if date is in 
format DD.MM.YY + @staticmethod + def check_date_format(date): + result = re.findall("\\d\\d.\\d\\d.\\d\\d", date) + if len(result) is 1: + return True + return False + + def set_group(self, group): + self.group = group + + def set_current_last_post_date(self, start): + self.current_last_post_date = start + + def set_output(self, output): + if str(output).endswith("/") or str(output).endswith("\\"): + self.output = output + else: + self.output = output + "/" + + def set_max_worker_threads(self, max_worker_threads): + self.max_worker_threads = max_worker_threads + + def start(self): + try: + self.opener.open(self.link_forum + self.group) + except urllib2.HTTPError: + print "\nError: could not locate group" + sys.exit(0) + if self.start_date_is_set is True: + self.get_group_from_start() + if os.path.isfile(self.output + "threads.txt"): + self.update() + else: + self.get_complete_group() + + def get_group_from_start(self): + self.make_dir() + with open(self.output + "threads.txt", 'w') as file_threads: + file_threads.write("XXX," + self.current_last_post_date + ",\n") + self.get_threads() + + def get_complete_group(self): + print "Entered group: " + self.group + print "complete group will be downloaded\n" + self.make_dir() + self.get_threads() + + def update(self): + print "update" + # load threads file, get first last_post_date + first_line = open(self.output + "threads.txt", "r").readline() + self.current_last_post_date = first_line.split(",")[1] + # compare first last_post_date with last_post_date from file + if self.threads_have_changes(self.output + "/threads.txt"): + self.get_threads() + else: + print "no changes in " + self.group + # if equal -> done(no new posts... possibly posts done on the same day as last download...) + + def get_threads(self): + html = self.opener.open(self.link_forum + self.group) + with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_worker_threads) as executor: + with open(self.output + "threads.txt", 'w') as file_threads: + while True: + next_page = self.get_threads_and_date(html, file_threads, executor) + if next_page == "": + break + html = self.opener.open(next_page) + + def get_threads_and_date(self, html, file_threads, executor): + next_page = "" + current_id = "" + for line in html: + last_post_date = re.findall("(?<=\"lastPostDate\">)([0-9\.:]*)", line) + if len(last_post_date) > 0: + if ":" in last_post_date[0]: + last_post_date[0] = time.strftime("%d.%m.%y") + if self.compare_date(last_post_date[0], self.current_last_post_date) == -1: + file_threads.write("XXXX," + self.current_last_post_date + ",\n") + next_page = "" + break + if last_post_date[0] == self.current_last_post_date: + return "" + file_threads.write(current_id + "," + last_post_date[0] + ",\n") + executor.submit(self.get_messages, current_id) + # self.get_messages(current_id) + temp = re.findall("(href=\")(.*?)(\")", line) + if len(temp) == 0: + continue + topic_id = re.findall("(?<=" + self.group + "/)(.*)", temp[0][1]) + if "topic" in temp[0][1]: + current_id = topic_id[0] + if "forum" in temp[0][1]: + next_page = temp[0][1] + return next_page + + # gets executed in own thread + def get_messages(self, topic_id): + counter = 0 + print "getting thread " + topic_id + try: + html = self.opener.open(self.link_topic + self.group + "/" + topic_id) + except urllib2.HTTPError: + print "skipping deleted thread " + topic_id + return + msg_id = "" + for line in html: + last_post_date = re.findall("(?<=\"lastPostDate\">)([0-9\.]*)", line) + if len(last_post_date) > 0: + if 
len(self.current_last_post_date) > 0: + if self.compare_date(last_post_date[0], self.current_last_post_date) < 1: + continue + self.get_mbox(topic_id, msg_id) + counter += 1 + temp = re.findall("(href=\")(.*?)(\")", line) + if len(temp) == 0: + continue + if "/msg/" + self.group + "/" in temp[0][1]: + msg_id = re.findall("(?<=" + topic_id + "/)(.*)", temp[0][1])[0] + if len(last_post_date) > 0: + if last_post_date == self.current_last_post_date: + print str(counter) + " messages for topic " + topic_id + " downloaded." + return "" + print str(counter) + " messages for topic " + topic_id + " downloaded." + + def get_mbox(self, topic_id, msg_id): + print "\tdownloading msg: " + msg_id + try: + mbox = self.opener.open(self.link_msg + self.group + "/" + topic_id + "/" + msg_id).read() + except urllib2.HTTPError: + print "skipping deleted message " + msg_id + return + with open(self.output + "mbox/" + msg_id + ".mbox", "w") as file_mbox: + file_mbox.write(mbox) + + @staticmethod + def compare_date(date1, date2): + # date format: dd.mm.yy + date1_split = date1.split(".") + date2_split = date2.split(".") + if int(date1_split[2]) == int(date2_split[2]): + # year1 == year2 + if int(date1_split[1]) == int(date2_split[1]): + # month1 == month + if int(date1_split[0]) < int(date2_split[0]): + # day1 < day2 + return -1 + if int(date1_split[0]) == int(date2_split[0]): + # day1 == day2 + return 0 + else: + # day1 > day2 + return 1 + if int(date1_split[1]) < int(date2_split[1]): + return -1 + else: + return 1 + if int(date1_split[2]) < int(date2_split[2]): + return -1 + else: + return 1 + + def threads_have_changes(self, path_file): + html = self.opener.open(self.link_forum + self.group) + for line in html: + if "lastPostDate" in line: + last_post_date = re.findall("(?<=\"lastPostDate\">)([0-9.:]*)", line)[0] + if ":" in last_post_date: + last_post_date = time.strftime("%d.%m.%y,\n") + if last_post_date == self.current_last_post_date: + with open(path_file, "w") as file_threads: + # to save the last update date for next update try + file_threads.write("XXXX," + self.current_last_post_date + ",\n") + return False + return True + + # creates subdirectories if they don't exist + def make_dir(self): + if not os.path.isdir(self.output + "mbox"): + print "creating directories\n" + os.makedirs(self.output + "mbox") + + +def main(): + argv = sys.argv + # argv = ['crawlerThreads.py', '-output', 'output/', '-group', 'phaser3-dev', '-threads', '40'] + # argv = ['crawlerThreads.py', '-output', 'output/', '-group', 'artspace', '-threads', '10'] + # argv = ['crawlerThreads.py', '-output', 'output/', '-group', 'artspace', '-threads', '10', '-start', '01.01.10'] + # argv = ['crawlerThreads.py', '-output', 'output/pants-devel2', '-group', 'pants-devel', '-threads', '200'] + crawler = Crawler() + crawler.handle_arguments(argv) + crawler.start() + + +main() diff --git a/codeface/google_groups_py/post_download.py b/codeface/google_groups_py/post_download.py new file mode 100755 index 00000000..922955db --- /dev/null +++ b/codeface/google_groups_py/post_download.py @@ -0,0 +1,111 @@ +import os.path +import sys +import re + + +class PostDownload(object): + + def __init__(self): + self.group = "" + self.output = "" + self.group_is_set = False + self.output_is_set = False + + @staticmethod + def print_help(): + print "################################################################################" + print "Help output:" + print "################################################################################" + print 
"post_download.py -output \"path\" -group \"groupname\"" + print "# # # # # # # # # # # # # # # # # # # # # # # # # # # " + print "-output: output path pointing to the directory containing the group's directory" + print "-group: name of the google group" + print "" + print "Multiple mbox files will be combined into one file" + print "Certain lines will be edited to be understandable by Codeface" + print "example: \"post_download.py -output output\\ -group pants\"" + print "################################################################################" + + # end execution if arguments aren't complete or help is displayed + def handle_arguments(self, argv): + if len(argv) is 1: + self.print_help() + sys.exit(0) + for i in xrange(len(argv)): + if argv[i] == "-h" or argv[i] == "--h" or argv[i] == "-help": + self.print_help() + sys.exit(0) + if argv[i] == "-output": + self.set_output(argv[i+1]) + self.output_is_set = True + if argv[i] == "-group": + self.group = argv[i+1] + self.group_is_set = True + if self.group_is_set is False: + print "Error: missing group" + self.print_help() + sys.exit(0) + if self.output_is_set is False: + print "Error: missing output" + self.print_help() + sys.exit(0) + + def set_output(self, output): + if str(output).endswith("/") or str(output).endswith("\\"): + self.output = output + else: + self.output = output + "/" + + def start(self): + self.merge_to_single_file() + + def merge_to_single_file(self): + with open(self.output + self.group + ".mbox", "w") as final_mbox: + for root, dirs, files in os.walk(self.output + "/mbox"): + for file in files: + if file.endswith(".mbox"): + with open(self.output + "mbox/" + file, "r") as single_mbox: + self.fix_content(single_mbox, final_mbox) + final_mbox.write("\n\n\n") + + def fix_content(self, single_mbox, final_mbox): + text = single_mbox.read().split("\n") + if "X-Received " in text[0]: + text[0] = "From GoogleGroups" + else: + text = ["From GoogleGroups"] + text + for line in text: + if "From: " in line: + line = self.handle_from_line(line) + final_mbox.write(line) + + @staticmethod + def handle_from_line(line): + from_line = re.findall("(?<=From: )\"?[ A-Za-z]*", line) + if len(from_line) == 1: + name = from_line[0].replace('"', '') + if name.endswith(" "): + name = name[:-1] + email = re.findall("<.*>", line) + if len(email) > 0: + if "..." 
in email[0]: + email_suffix = re.findall("@[a-zA-Z0-1\-.]*", line) + if len(email_suffix) != 0: + email_suffix = email_suffix[0] + else: + email_suffix = "@unknown.com" + return "From: " + name + " <" + name.replace(" ", "_") + email_suffix + ">\n" + else: + return "From: " + name + " " + email[0] + "\n" + return line + + +def main(): + argv = sys.argv + argv = ["-output", "../test/pants", "-group", "pants-devel"] + post_download = PostDownload() + post_download.handle_arguments(argv) + post_download.start() + + +main() \ No newline at end of file From ca69124d9c94fd2aed6d2000c25aa6ada4588ebc Mon Sep 17 00:00:00 2001 From: Georg Berner Date: Mon, 11 Apr 2016 00:37:40 +0200 Subject: [PATCH 2/3] Refractored and renamed the Google Groups Group Download (now named g3d) added UnitTests to run g3d use the run_g3d.py file --- codeface/google_groups_py/crawler.py | 271 ----------------- codeface/google_groups_py/g3d.py | 334 +++++++++++++++++++++ codeface/google_groups_py/post_download.py | 111 ------- codeface/google_groups_py/run_g3d.py | 12 + codeface/google_groups_py/utest_g3d.py | 228 ++++++++++++++ 5 files changed, 574 insertions(+), 382 deletions(-) delete mode 100755 codeface/google_groups_py/crawler.py create mode 100644 codeface/google_groups_py/g3d.py delete mode 100755 codeface/google_groups_py/post_download.py create mode 100644 codeface/google_groups_py/run_g3d.py create mode 100644 codeface/google_groups_py/utest_g3d.py diff --git a/codeface/google_groups_py/crawler.py b/codeface/google_groups_py/crawler.py deleted file mode 100755 index 5471a152..00000000 --- a/codeface/google_groups_py/crawler.py +++ /dev/null @@ -1,271 +0,0 @@ -import re -import os.path -import urllib2 -import time -import sys -import concurrent.futures - - -class Crawler(object): - - def __init__(self): - self.group = "" - self.output = "" - self.link_topic = "https://groups.google.com/forum/?_escaped_fragment_=topic/" - self.link_msg = "https://groups.google.com/forum/message/raw?msg=" - self.link_forum = "https://groups.google.com/forum/?_escaped_fragment_=forum/" - self.opener = urllib2.build_opener() - self.opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox')] - self.current_last_post_date = "00.00.00" - self.max_worker_threads = 1 - self.group_is_set = False - self.output_is_set = False - self.start_date_is_set = False - print "----------------------------------------------------" - print "------- Google Group Crawler -------\n" - - # end execution if arguments aren't complete or help is displayed - def handle_arguments(self, argv): - if len(argv) is 1: - self.print_help() - sys.exit(0) - for i in xrange(len(argv)): - if argv[i] == "-h" or argv[i] == "--h" or argv[i] == "-help": - self.print_help() - sys.exit(0) - if argv[i] == "-output": - self.set_output(argv[i+1]) - self.output_is_set = True - if argv[i] == "-group": - self.set_group(argv[i+1]) - self.group_is_set = True - if argv[i] == "-threads": - self.set_max_worker_threads(argv[i+1]) - if argv[i] == "-start": - self.set_current_last_post_date(argv[i+1]) - if self.check_date_format(self.current_last_post_date) is False: - print"Error: date wrong format" - self.print_help() - sys.exit(0) - self.start_date_is_set = True - if self.group_is_set is False: - print "Error: missing group" - self.print_help() - sys.exit(0) - if self.output_is_set is False: - print "Error: missing output" - self.print_help() - sys.exit(0) - - @staticmethod - def print_help(): - print 
"################################################################################" - print "Help output:" - print "################################################################################" - print "crawler.py -output \"path\" -group \"groupname\" [-threads \"#threads\",\n -start \"DD.MM.YY\"]" - print "################################################################################" - print "-output: output path for ggc" - print "-group: name of the google group" - print "-threads: number of threads for simultaneous download (default = 1) (only for dl)" - print "-start: date from which all messages of a group will be downloaded, \"DD.MM.YY\" (only for dl)" - print "" - print "If group has been downloaded before only new messages will be downloaded" - print "To download the entire group again delete the threads.txt file or the complete directory" - print "################################################################################" - - # checks if date is in format DD.MM.YY - @staticmethod - def check_date_format(date): - result = re.findall("\\d\\d.\\d\\d.\\d\\d", date) - if len(result) is 1: - return True - return False - - def set_group(self, group): - self.group = group - - def set_current_last_post_date(self, start): - self.current_last_post_date = start - - def set_output(self, output): - if str(output).endswith("/") or str(output).endswith("\\"): - self.output = output - else: - self.output = output + "/" - - def set_max_worker_threads(self, max_worker_threads): - self.max_worker_threads = max_worker_threads - - def start(self): - try: - self.opener.open(self.link_forum + self.group) - except urllib2.HTTPError: - print "\nError: could not locate group" - sys.exit(0) - if self.start_date_is_set is True: - self.get_group_from_start() - if os.path.isfile(self.output + "threads.txt"): - self.update() - else: - self.get_complete_group() - - def get_group_from_start(self): - self.make_dir() - with open(self.output + "threads.txt", 'w') as file_threads: - file_threads.write("XXX," + self.current_last_post_date + ",\n") - self.get_threads() - - def get_complete_group(self): - print "Entered group: " + self.group - print "complete group will be downloaded\n" - self.make_dir() - self.get_threads() - - def update(self): - print "update" - # load threads file, get first last_post_date - first_line = open(self.output + "threads.txt", "r").readline() - self.current_last_post_date = first_line.split(",")[1] - # compare first last_post_date with last_post_date from file - if self.threads_have_changes(self.output + "/threads.txt"): - self.get_threads() - else: - print "no changes in " + self.group - # if equal -> done(no new posts... possibly posts done on the same day as last download...) 
- - def get_threads(self): - html = self.opener.open(self.link_forum + self.group) - with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_worker_threads) as executor: - with open(self.output + "threads.txt", 'w') as file_threads: - while True: - next_page = self.get_threads_and_date(html, file_threads, executor) - if next_page == "": - break - html = self.opener.open(next_page) - - def get_threads_and_date(self, html, file_threads, executor): - next_page = "" - current_id = "" - for line in html: - last_post_date = re.findall("(?<=\"lastPostDate\">)([0-9\.:]*)", line) - if len(last_post_date) > 0: - if ":" in last_post_date[0]: - last_post_date[0] = time.strftime("%d.%m.%y") - if self.compare_date(last_post_date[0], self.current_last_post_date) == -1: - file_threads.write("XXXX," + self.current_last_post_date + ",\n") - next_page = "" - break - if last_post_date[0] == self.current_last_post_date: - return "" - file_threads.write(current_id + "," + last_post_date[0] + ",\n") - executor.submit(self.get_messages, current_id) - # self.get_messages(current_id) - temp = re.findall("(href=\")(.*?)(\")", line) - if len(temp) == 0: - continue - topic_id = re.findall("(?<=" + self.group + "/)(.*)", temp[0][1]) - if "topic" in temp[0][1]: - current_id = topic_id[0] - if "forum" in temp[0][1]: - next_page = temp[0][1] - return next_page - - # gets executed in own thread - def get_messages(self, topic_id): - counter = 0 - print "getting thread " + topic_id - try: - html = self.opener.open(self.link_topic + self.group + "/" + topic_id) - except urllib2.HTTPError: - print "skipping deleted thread " + topic_id - return - msg_id = "" - for line in html: - last_post_date = re.findall("(?<=\"lastPostDate\">)([0-9\.]*)", line) - if len(last_post_date) > 0: - if len(self.current_last_post_date) > 0: - if self.compare_date(last_post_date[0], self.current_last_post_date) < 1: - continue - self.get_mbox(topic_id, msg_id) - counter += 1 - temp = re.findall("(href=\")(.*?)(\")", line) - if len(temp) == 0: - continue - if "/msg/" + self.group + "/" in temp[0][1]: - msg_id = re.findall("(?<=" + topic_id + "/)(.*)", temp[0][1])[0] - if len(last_post_date) > 0: - if last_post_date == self.current_last_post_date: - print str(counter) + " messages for topic " + topic_id + " downloaded." - return "" - print str(counter) + " messages for topic " + topic_id + " downloaded." 
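
The date handling in the hunk above carries over unchanged into g3d.py below: compare_date() is a static method that takes two "DD.MM.YY" strings and returns -1, 0 or 1 depending on whether the first date lies before, on or after the second. A small illustrative check, assuming g3d.py is importable and using made-up dates:

# illustrative only, not part of the patches
from g3d import Crawler

print Crawler.compare_date("15.01.16", "01.02.16")   # -1: first date is earlier
print Crawler.compare_date("01.02.16", "15.01.16")   # 1: first date is later
print Crawler.compare_date("11.11.11", "11.11.11")   # 0: dates are equal
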
- - def get_mbox(self, topic_id, msg_id): - print "\tdownloading msg: " + msg_id - try: - mbox = self.opener.open(self.link_msg + self.group + "/" + topic_id + "/" + msg_id).read() - except urllib2.HTTPError: - print "skipping deleted message " + msg_id - return - with open(self.output + "mbox/" + msg_id + ".mbox", "w") as file_mbox: - file_mbox.write(mbox) - - @staticmethod - def compare_date(date1, date2): - # date format: dd.mm.yy - date1_split = date1.split(".") - date2_split = date2.split(".") - if int(date1_split[2]) == int(date2_split[2]): - # year1 == year2 - if int(date1_split[1]) == int(date2_split[1]): - # month1 == month - if int(date1_split[0]) < int(date2_split[0]): - # day1 < day2 - return -1 - if int(date1_split[0]) == int(date2_split[0]): - # day1 == day2 - return 0 - else: - # day1 > day2 - return 1 - if int(date1_split[1]) < int(date2_split[1]): - return -1 - else: - return 1 - if int(date1_split[2]) < int(date2_split[2]): - return -1 - else: - return 1 - - def threads_have_changes(self, path_file): - html = self.opener.open(self.link_forum + self.group) - for line in html: - if "lastPostDate" in line: - last_post_date = re.findall("(?<=\"lastPostDate\">)([0-9.:]*)", line)[0] - if ":" in last_post_date: - last_post_date = time.strftime("%d.%m.%y,\n") - if last_post_date == self.current_last_post_date: - with open(path_file, "w") as file_threads: - # to save the last update date for next update try - file_threads.write("XXXX," + self.current_last_post_date + ",\n") - return False - return True - - # creates subdirectories if they don't exist - def make_dir(self): - if not os.path.isdir(self.output + "mbox"): - print "creating directories\n" - os.makedirs(self.output + "mbox") - - -def main(): - argv = sys.argv - # argv = ['crawlerThreads.py', '-output', 'output/', '-group', 'phaser3-dev', '-threads', '40'] - # argv = ['crawlerThreads.py', '-output', 'output/', '-group', 'artspace', '-threads', '10'] - # argv = ['crawlerThreads.py', '-output', 'output/', '-group', 'artspace', '-threads', '10', '-start', '01.01.10'] - # argv = ['crawlerThreads.py', '-output', 'output/pants-devel2', '-group', 'pants-devel', '-threads', '200'] - crawler = Crawler() - crawler.handle_arguments(argv) - crawler.start() - - -main() diff --git a/codeface/google_groups_py/g3d.py b/codeface/google_groups_py/g3d.py new file mode 100644 index 00000000..38c9045c --- /dev/null +++ b/codeface/google_groups_py/g3d.py @@ -0,0 +1,334 @@ +import re +import os.path +import urllib2 +import time +import sys +import threading +import concurrent.futures + + +class Crawler(object): + + def __init__(self): + self.group = "" + self.output = "" + self.link_topic = "https://groups.google.com/forum/?_escaped_fragment_=topic/" + self.link_msg = "https://groups.google.com/forum/message/raw?msg=" + self.link_forum = "https://groups.google.com/forum/?_escaped_fragment_=forum/" + self.opener = urllib2.build_opener() + self.opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox')] + self.current_last_post_date = "00.00.00" + self.max_worker_threads = 1 + self.group_is_set = False + self.output_is_set = False + self.start_date_is_set = False + self.verbose = False + self.execute = False + self.lock = threading.Lock() + self.counter = 0 + self.elapsed_time = 0 + print "------------------------g3d-------------------------" + print "------- Google Groups Group Downloader -----\n" + + # end execution if arguments aren't complete or help is displayed + def 
handle_arguments(self, argv): + if len(argv) is 1: + self.print_help() + return + for i in xrange(len(argv)): + if argv[i] == "-h" or argv[i] == "--h" or argv[i] == "-help": + self.print_help() + return + if argv[i] == "-v": + self.verbose = True + continue + if argv[i] == "-output": + if i+1 == len(argv): + break + self.set_output(argv[i+1]) + self.output_is_set = True + continue + if argv[i] == "-group": + if i+1 == len(argv): + break + self.group = argv[i+1] + self.group_is_set = True + continue + if argv[i] == "-threads": + if i+1 == len(argv): + print "Error: amount of threads missing" + return + try: + self.max_worker_threads = int(argv[i+1]) + except ValueError: + print "\nError: argument following -threads is not a number" + return + continue + if argv[i] == "-start": + if i+1 == len(argv): + print "Error: start date missing" + return + self.current_last_post_date = argv[i+1] + if self.check_date_format(self.current_last_post_date) is False: + print"Error: date wrong format" + return + self.start_date_is_set = True + continue + if self.group_is_set is False: + print "Error: missing group" + return + if self.output_is_set is False: + print "Warning: missing output" + self.set_output("./") + self.output_is_set = True + self.execute = True + + @staticmethod + def print_help(): + print "################################################################################" + print "Help output:" + print "################################################################################" + print "crawler.py -output \"path\" -group \"groupname\" [-threads \"#threads\",\n -start \"DD.MM.YY\"]" + print "################################################################################" + print "-output: output path for ggc" + print "-group: name of the google group" + print "-threads: number of threads for simultaneous download (default = 1) (only for dl)" + print "-start: date from which all messages of a group will be downloaded, \"DD.MM.YY\" (only for dl)" + print "-v: verbose output" + print "################################################################################" + + # checks if date is in format DD.MM.YY + @staticmethod + def check_date_format(date): + result = re.findall("^\\d\\d.\\d\\d.\\d\\d$", date) + if len(result) is 1: + return True + return False + + def set_output(self, output): + if str(output).endswith("/") or str(output).endswith("\\"): + self.output = output + else: + self.output = output + "/" + + def start(self): + start_time = time.time() + try: + self.opener.open(self.link_forum + self.group) + except urllib2.HTTPError: + print "\nError: could not locate group" + sys.exit(0) + if self.start_date_is_set is True: + self.get_group_from_start() + else: + self.get_complete_group() + self.elapsed_time = time.time()-start_time + + def get_group_from_start(self): + self.make_dir() + with open(self.output + self.group + ".mbox", "w"): + print "creating output file" + with open(self.output + "threads.txt", 'w') as file_threads: + file_threads.write("XXX," + self.current_last_post_date + ",\n") + self.get_threads() + + def get_complete_group(self): + print "Entered group: " + self.group + print "complete group will be downloaded\n" + self.make_dir() + with open(self.output + self.group + ".mbox", "w"): + print "creating output file" + if self.verbose is False: + print "starting download, please wait..." 
+ self.get_threads() + + def get_threads(self): + try: + html = self.opener.open(self.link_forum + self.group) + except urllib2.HTTPError: + return + with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_worker_threads) as executor: + while True: + next_page = self.get_threads_and_date(html, executor) + if next_page == "": + break + html = self.opener.open(next_page) + + def get_threads_and_date(self, html, executor): + next_page = "" + current_id = "" + for line in html: + last_post_date = self.search_last_post_date(line) + if len(last_post_date) > 0: + if ":" in last_post_date: + last_post_date = time.strftime("%d.%m.%y") + if self.compare_date(last_post_date, self.current_last_post_date) == -1: + next_page = "" + break + if last_post_date == self.current_last_post_date: + return "" + executor.submit(self.get_messages, current_id) + href = self.search_href(line) + if len(href) == 0: + continue + topic_id = self.search_topic_id(self.group, href) + if "topic" in href: + current_id = topic_id + if "forum" in href: + next_page = href + return next_page + + @staticmethod + def search_topic_id(group, line): + temp = re.findall("(?<=" + group + "/)(.*)", line) + if len(temp) == 0: + return "" + return temp[0] + + @staticmethod + def search_href(line): + temp = re.findall("(href=\")(.*?)(\")", line) + if len(temp) == 0: + return "" + return temp[0][1] + + @staticmethod + def search_last_post_date(line): + temp = re.findall("(?<=\"lastPostDate\">)([0-9\.:]*)", line) + if len(temp) == 0: + return "" + return temp[0] + + # gets executed in own thread + def get_messages(self, topic_id): + counter = 0 + if self.verbose is True: + print "getting thread " + topic_id + try: + html = self.opener.open(self.link_topic + self.group + "/" + topic_id) + except urllib2.HTTPError: + if self.verbose is True: + print "skipping deleted thread " + topic_id + return + msg_id = "" + for line in html: + last_post_date = self.search_last_post_date(line) + if len(last_post_date) > 0: + if len(self.current_last_post_date) > 0: + if self.compare_date(last_post_date, self.current_last_post_date) < 1: + continue + self.get_mbox(topic_id, msg_id) + counter += 1 + href = self.search_href(line) + if len(href) == 0: + continue + if "/msg/" + self.group + "/" in href: + msg_id = self.search_msg_id(topic_id, href) + if len(last_post_date) > 0: + if last_post_date == self.current_last_post_date: + if self.verbose is True: + print str(counter) + " messages for topic " + topic_id + " downloaded." + return "" + if self.verbose is True: + print str(counter) + " messages for topic " + topic_id + " downloaded." 
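
For readers skimming the flattened diff: get_threads() opens the forum index and builds a concurrent.futures.ThreadPoolExecutor, get_threads_and_date() submits one get_messages() call per topic to that executor, and every downloaded message is later appended to the single group mbox under self.lock (see add_to_mbox() further down), so parallel workers cannot interleave their writes. A minimal standalone sketch of that pattern, with a made-up worker function and topic ids, relying on the same futures backport for Python 2 that g3d.py already imports:

# illustrative only, not part of the patches
import threading
import concurrent.futures

lock = threading.Lock()

def download_topic(topic_id):      # hypothetical stand-in for Crawler.get_messages
    with lock:                     # serialise output, as add_to_mbox() does for file writes
        print "downloaded topic " + topic_id

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    for topic_id in ["topic-a", "topic-b", "topic-c"]:   # placeholder ids
        executor.submit(download_topic, topic_id)
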
+ + @staticmethod + def search_msg_id(topic_id, href): + return re.findall("(?<=" + topic_id + "/)(.*)", href)[0] + + def get_mbox(self, topic_id, msg_id): + if self.verbose is True: + print "\tdownloading -> topic: " + topic_id + " msg: " + msg_id + try: + mbox = self.opener.open(self.link_msg + self.group + "/" + topic_id + "/" + msg_id).read() + except urllib2.HTTPError: + if self.verbose is True: + print "skipping deleted message " + msg_id + return + self.add_to_mbox(mbox) + + @staticmethod + def compare_date(date1, date2): + # date format: dd.mm.yy + date1_split = date1.split(".") + date2_split = date2.split(".") + if int(date1_split[2]) == int(date2_split[2]): + # year1 == year2 + if int(date1_split[1]) == int(date2_split[1]): + # month1 == month + if int(date1_split[0]) < int(date2_split[0]): + # day1 < day2 + return -1 + if int(date1_split[0]) == int(date2_split[0]): + # day1 == day2 + return 0 + else: + # day1 > day2 + return 1 + if int(date1_split[1]) < int(date2_split[1]): + return -1 + else: + return 1 + if int(date1_split[2]) < int(date2_split[2]): + return -1 + else: + return 1 + + # creates subdirectories if they don't exist + def make_dir(self): + if not os.path.isdir(self.output): + print "creating directory\n" + os.makedirs(self.output) + + def add_to_mbox(self, raw): + clean_mbox = "\n\n\nFrom Google Groups\n" + self.check_mbox(raw) + with self.lock: + with open(self.output + self.group + ".mbox", "a") as final_mbox: + final_mbox.write(clean_mbox) + self.counter += 1 + + def check_mbox(self, raw): + text = raw.split("\n") + clean = "" + for line in text: + if line.startswith("From "): + line = " " + line + if line.startswith("From: "): + line = self.handle_from_line(line) + "\n" + clean += line + return clean + + @staticmethod + def search_name(line): + temp = re.findall("(?<=From: )\"?[ A-Za-z]*", line) + if len(temp) > 0: + name = temp[0].replace('"', '') + if name.endswith(" "): + name = name[:-1] + return name + return "" + + @staticmethod + def search_email(line, name): + email = re.findall("<.*>", line) + if len(email) > 0: + if "..." in email[0]: + email_suffix = re.findall("@[a-zA-Z0-1\-.]*", line) + if len(email_suffix) != 0: + email_suffix = email_suffix[0] + else: + email_suffix = "@unknown.com" + return "<" + name.replace(" ", "_") + email_suffix + ">" + return "" + + def handle_from_line(self, line): + name = self.search_name(line) + if len(name) != 0: + email = self.search_email(line, name) + if len(email) > 0: + return "From: " + name + " " + email + return line + + def statistic(self): + print "download complete" + print "g3d downloaded " + str(self.counter) + " messages in " + str(int(self.elapsed_time)) + " seconds." 
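
That is the end of g3d.py. One detail worth spelling out is the From-line rewriting: Google Groups obfuscates sender addresses in the raw messages (something like jan...@example.com), so handle_from_line() rebuilds a parseable address from the sender name plus the still-visible domain, falling back to @unknown.com when no domain survives, which keeps the merged mbox understandable for Codeface. A quick illustration with an invented sender (constructing the Crawler also prints its start-up banner):

# illustrative only, not part of the patches
from g3d import Crawler

crawler = Crawler()
header = 'From: "Jane Doe" <jan...@example.com>'   # hypothetical obfuscated header
print crawler.handle_from_line(header)
# prints: From: Jane Doe <Jane_Doe@example.com>
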
diff --git a/codeface/google_groups_py/post_download.py b/codeface/google_groups_py/post_download.py deleted file mode 100755 index 922955db..00000000 --- a/codeface/google_groups_py/post_download.py +++ /dev/null @@ -1,111 +0,0 @@ -import os.path -import sys -import re - - -class PostDownload(object): - - def __init__(self): - self.group = "" - self.output = "" - self.group_is_set = False - self.output_is_set = False - - @staticmethod - def print_help(): - print "################################################################################" - print "Help output:" - print "################################################################################" - print "post_download.py -output \"path\" -group \"groupname\"" - print "# # # # # # # # # # # # # # # # # # # # # # # # # # # " - print "-output: output path pointing to the directory containing the group's directory" - print "-group: name of the google group" - print "" - print "Multiple mbox files will be combined into one file" - print "Certain lines will be edited to be understandable by Codeface" - print "example: \"post_download.py -output output\\ -group pants\"" - print "################################################################################" - - # end execution if arguments aren't complete or help is displayed - def handle_arguments(self, argv): - if len(argv) is 1: - self.print_help() - sys.exit(0) - for i in xrange(len(argv)): - if argv[i] == "-h" or argv[i] == "--h" or argv[i] == "-help": - self.print_help() - sys.exit(0) - if argv[i] == "-output": - self.set_output(argv[i+1]) - self.output_is_set = True - if argv[i] == "-group": - self.group = argv[i+1] - self.group_is_set = True - if self.group_is_set is False: - print "Error: missing group" - self.print_help() - sys.exit(0) - if self.output_is_set is False: - print "Error: missing output" - self.print_help() - sys.exit(0) - - def set_output(self, output): - if str(output).endswith("/") or str(output).endswith("\\"): - self.output = output - else: - self.output = output + "/" - - def start(self): - self.merge_to_single_file() - - def merge_to_single_file(self): - with open(self.output + self.group + ".mbox", "w") as final_mbox: - for root, dirs, files in os.walk(self.output + "/mbox"): - for file in files: - if file.endswith(".mbox"): - with open(self.output + "mbox/" + file, "r") as single_mbox: - self.fix_content(single_mbox, final_mbox) - final_mbox.write("\n\n\n") - - def fix_content(self, single_mbox, final_mbox): - text = single_mbox.read().split("\n") - if "X-Received " in text[0]: - text[0] = "From GoogleGroups" - else: - text = ["From GoogleGroups"] + text - for line in text: - if "From: " in line: - line = self.handle_from_line(line) - final_mbox.write(line) - - @staticmethod - def handle_from_line(line): - from_line = re.findall("(?<=From: )\"?[ A-Za-z]*", line) - if len(from_line) == 1: - name = from_line[0].replace('"', '') - if name.endswith(" "): - name = name[:-1] - email = re.findall("<.*>", line) - if len(email) > 0: - if "..." 
in email[0]: - email_suffix = re.findall("@[a-zA-Z0-1\-.]*", line) - if len(email_suffix) != 0: - email_suffix = email_suffix[0] - else: - email_suffix = "@unknown.com" - return "From: " + name + " <" + name.replace(" ", "_") + email_suffix + ">\n" - else: - return "From: " + name + " " + email[0] + "\n" - return line - - -def main(): - argv = sys.argv - argv = ["-output", "../test/pants", "-group", "pants-devel"] - post_download = PostDownload() - post_download.handle_arguments(argv) - post_download.start() - - -main() \ No newline at end of file diff --git a/codeface/google_groups_py/run_g3d.py b/codeface/google_groups_py/run_g3d.py new file mode 100644 index 00000000..2ef20e06 --- /dev/null +++ b/codeface/google_groups_py/run_g3d.py @@ -0,0 +1,12 @@ +from g3d import Crawler +import sys + +def main(): + argv = sys.argv + crawler = Crawler() + crawler.handle_arguments(argv) + if crawler.execute is True: + crawler.start() + crawler.statistic() + +main() \ No newline at end of file diff --git a/codeface/google_groups_py/utest_g3d.py b/codeface/google_groups_py/utest_g3d.py new file mode 100644 index 00000000..4f8ddd21 --- /dev/null +++ b/codeface/google_groups_py/utest_g3d.py @@ -0,0 +1,228 @@ +from g3d import Crawler +import unittest + + +class argumentHandlingTests(unittest.TestCase): + # no errors + def test1(self): + argv = ["-group", "test_group", "-start", "11.11.11", "-output", "./", "-threads", "50"] + crawler = Crawler() + crawler.handle_arguments(argv) + assert crawler.execute == True + assert crawler.group_is_set == True + assert crawler.group == "test_group" + assert crawler.max_worker_threads == 50 + assert crawler.output_is_set == True + assert crawler.output == "./" + assert crawler.verbose == False + + # -output missing -> should work + def test2(self): + argv = ["-group", "test_group", "-threads", "50"] + crawler = Crawler() + crawler.handle_arguments(argv) + assert crawler.execute == True + assert crawler.output_is_set == True + assert crawler.output == "./" + + # -group missing + def test3(self): + argv = ["-output", "./", "-threads", "50"] + crawler = Crawler() + crawler.handle_arguments(argv) + assert crawler.execute == False + assert crawler.group_is_set == False + + # -threads number missing + def test4(self): + argv = ["-group", "test_group", "-threads", "-output", "./"] + crawler = Crawler() + crawler.handle_arguments(argv) + assert crawler.execute == False + assert crawler.max_worker_threads == 1 + + # -threads number missing at end + def test5(self): + argv = ["", "-threads"] + crawler = Crawler() + crawler.handle_arguments(argv) + assert crawler.execute == False + assert crawler.max_worker_threads == 1 + + # -group follow up missing at end + def test6(self): + argv = ["", "-group"] + crawler = Crawler() + crawler.handle_arguments(argv) + assert crawler.execute == False + assert crawler.group_is_set == False + + + # -output follow up missing at end + def test7(self): + argv = ["-group", "test_group", "-output"] + crawler = Crawler() + crawler.handle_arguments(argv) + assert crawler.execute == True + assert crawler.output_is_set == True + assert crawler.output == "./" + + + # -v is set + def test8(self): + argv = ["", "-v"] + crawler = Crawler() + crawler.handle_arguments(argv) + assert crawler.verbose == True + + # -start follow up missing + def test9(self): + argv = ["-group", "test_group", "-start", "-output", "./"] + crawler = Crawler() + crawler.handle_arguments(argv) + assert crawler.start_date_is_set == False + + + # start format check + def test10(self): + 
+        crawler = Crawler()
+        assert crawler.check_date_format("11.11.11") == True
+        assert crawler.check_date_format("111.11.11") == False
+        assert crawler.check_date_format("11.111.11") == False
+        assert crawler.check_date_format("11.11.111") == False
+
+
+class checkRegexFunction(unittest.TestCase):
+    # href
+    def test1(self):
+        crawler = Crawler()
+        line = '27.04.10
Date: Tue, 12 Apr 2016 09:27:23 +0200
Subject: [PATCH 3/3] add copyright header and update help output

---
 codeface/google_groups_py/g3d.py       | 22 +++++++++++++++++++---
 codeface/google_groups_py/run_g3d.py   | 19 ++++++++++++++++++-
 codeface/google_groups_py/utest_g3d.py | 18 +++++++++++++++++-
 3 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/codeface/google_groups_py/g3d.py b/codeface/google_groups_py/g3d.py
index 38c9045c..462f789d 100644
--- a/codeface/google_groups_py/g3d.py
+++ b/codeface/google_groups_py/g3d.py
@@ -1,3 +1,19 @@
+## This file is part of Codeface. Codeface is free software: you can
+## redistribute it and/or modify it under the terms of the GNU General Public
+## License as published by the Free Software Foundation, version 2.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##
+## Copyright 2016 by Georg Berner
+## All Rights Reserved.
+
 import re
 import os.path
 import urllib2
@@ -88,12 +104,12 @@ def print_help():
         print "################################################################################"
         print "Help output:"
         print "################################################################################"
-        print "crawler.py -output \"path\" -group \"groupname\" [-threads \"#threads\",\n -start \"DD.MM.YY\"]"
+        print "run_g3d.py -output \"path\" -group \"groupname\" [-threads \"#threads\",\n -start \"DD.MM.YY\"]"
         print "################################################################################"
         print "-output: output path for ggc"
         print "-group: name of the google group"
-        print "-threads: number of threads for simultaneous download (default = 1) (only for dl)"
-        print "-start: date from which all messages of a group will be downloaded, \"DD.MM.YY\" (only for dl)"
+        print "-threads: number of threads for simultaneous download (default = 1)"
+        print "-start: date from which all messages of a group will be downloaded, \"DD.MM.YY\""
         print "-v: verbose output"
         print "################################################################################"
 
diff --git a/codeface/google_groups_py/run_g3d.py b/codeface/google_groups_py/run_g3d.py
index 2ef20e06..9be6c263 100644
--- a/codeface/google_groups_py/run_g3d.py
+++ b/codeface/google_groups_py/run_g3d.py
@@ -1,3 +1,20 @@
+#! /usr/bin/env python
+## This file is part of Codeface. Codeface is free software: you can
+## redistribute it and/or modify it under the terms of the GNU General Public
+## License as published by the Free Software Foundation, version 2.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##
+## Copyright 2016 by Georg Berner
+## All Rights Reserved.
+
 from g3d import Crawler
 import sys
 
@@ -9,4 +26,4 @@ def main():
         crawler.start()
         crawler.statistic()
 
-main()
\ No newline at end of file
+main()
diff --git a/codeface/google_groups_py/utest_g3d.py b/codeface/google_groups_py/utest_g3d.py
index 4f8ddd21..82e77740 100644
--- a/codeface/google_groups_py/utest_g3d.py
+++ b/codeface/google_groups_py/utest_g3d.py
@@ -1,3 +1,19 @@
+## This file is part of Codeface. Codeface is free software: you can
+## redistribute it and/or modify it under the terms of the GNU General Public
+## License as published by the Free Software Foundation, version 2.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##
+## Copyright 2016 by Georg Berner
+## All Rights Reserved.
+
 from g3d import Crawler
 import unittest
 
@@ -225,4 +241,4 @@ def test7(self):
         date1 = "11.11.11"
         date2 = "11.11.11"
         result = crawler.compare_date(date1, date2)
-        assert result == 0
\ No newline at end of file
+        assert result == 0
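
Across the three patches, run_g3d.py is the intended entry point and the help text above documents its flags. As a quick start, the downloader can also be driven directly from Python 2; a sketch with a placeholder output path and group name (network access to groups.google.com is required, and the result lands in out/pants-devel.mbox):

# illustrative only, not part of the patches
from g3d import Crawler

crawler = Crawler()
# equivalent to: python run_g3d.py -output "out/" -group "pants-devel" -threads 10 -v
crawler.handle_arguments(["run_g3d.py", "-output", "out/", "-group", "pants-devel",
                          "-threads", "10", "-v"])
if crawler.execute is True:
    crawler.start()      # download every message of the group into one mbox file
    crawler.statistic()  # report message count and elapsed time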