diff --git a/codeface/google_groups_py/g3d.py b/codeface/google_groups_py/g3d.py
new file mode 100644
index 00000000..462f789d
--- /dev/null
+++ b/codeface/google_groups_py/g3d.py
@@ -0,0 +1,350 @@
+## This file is part of Codeface. Codeface is free software: you can
+## redistribute it and/or modify it under the terms of the GNU General Public
+## License as published by the Free Software Foundation, version 2.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##
+## Copyright 2016 by Georg Berner
+## All Rights Reserved.
+
+import re
+import os.path
+import urllib2
+import time
+import sys
+import threading
+import concurrent.futures
+
+
+class Crawler(object):
+
+    def __init__(self):
+        self.group = ""
+        self.output = ""
+        self.link_topic = "https://groups.google.com/forum/?_escaped_fragment_=topic/"
+        self.link_msg = "https://groups.google.com/forum/message/raw?msg="
+        self.link_forum = "https://groups.google.com/forum/?_escaped_fragment_=forum/"
+        self.opener = urllib2.build_opener()
+        self.opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox')]
+        self.current_last_post_date = "00.00.00"
+        self.max_worker_threads = 1
+        self.group_is_set = False
+        self.output_is_set = False
+        self.start_date_is_set = False
+        self.verbose = False
+        self.execute = False
+        self.lock = threading.Lock()
+        self.counter = 0
+        self.elapsed_time = 0
+        print "------------------------g3d-------------------------"
+        print "------- Google Groups Group Downloader -----\n"
+
+    # end execution if arguments aren't complete or help is displayed
+    def handle_arguments(self, argv):
+        if len(argv) == 1:
+            self.print_help()
+            return
+        for i in xrange(len(argv)):
+            if argv[i] == "-h" or argv[i] == "--h" or argv[i] == "-help":
+                self.print_help()
+                return
+            if argv[i] == "-v":
+                self.verbose = True
+                continue
+            if argv[i] == "-output":
+                if i+1 == len(argv):
+                    break
+                self.set_output(argv[i+1])
+                self.output_is_set = True
+                continue
+            if argv[i] == "-group":
+                if i+1 == len(argv):
+                    break
+                self.group = argv[i+1]
+                self.group_is_set = True
+                continue
+            if argv[i] == "-threads":
+                if i+1 == len(argv):
+                    print "Error: amount of threads missing"
+                    return
+                try:
+                    self.max_worker_threads = int(argv[i+1])
+                except ValueError:
+                    print "\nError: argument following -threads is not a number"
+                    return
+                continue
+            if argv[i] == "-start":
+                if i+1 == len(argv):
+                    print "Error: start date missing"
+                    return
+                self.current_last_post_date = argv[i+1]
+                if self.check_date_format(self.current_last_post_date) is False:
+                    print "Error: date wrong format"
+                    return
+                self.start_date_is_set = True
+                continue
+        if self.group_is_set is False:
+            print "Error: missing group"
+            return
+        if self.output_is_set is False:
+            print "Warning: missing output"
+            self.set_output("./")
+            self.output_is_set = True
+        self.execute = True
+
+    @staticmethod
+    def print_help():
+        print "################################################################################"
+        print "Help output:"
+        print "################################################################################"
+        print "run_g3d.py -output \"path\" -group \"groupname\" [-threads \"#threads\",\n -start \"DD.MM.YY\"]"
+        print "################################################################################"
+        print "-output: output path for g3d"
+        print "-group: name of the google group"
+        print "-threads: number of threads for simultaneous download (default = 1)"
+        print "-start: date from which all messages of a group will be downloaded, \"DD.MM.YY\""
+        print "-v: verbose output"
+        print "################################################################################"
+
+    # checks if date is in format DD.MM.YY
+    @staticmethod
+    def check_date_format(date):
+        result = re.findall("^\\d\\d.\\d\\d.\\d\\d$", date)
+        if len(result) == 1:
+            return True
+        return False
+
+    def set_output(self, output):
+        if str(output).endswith("/") or str(output).endswith("\\"):
+            self.output = output
+        else:
+            self.output = output + "/"
+
+    def start(self):
+        start_time = time.time()
+        try:
+            self.opener.open(self.link_forum + self.group)
+        except urllib2.HTTPError:
+            print "\nError: could not locate group"
+            sys.exit(0)
+        if self.start_date_is_set is True:
+            self.get_group_from_start()
+        else:
+            self.get_complete_group()
+        self.elapsed_time = time.time() - start_time
+
+    def get_group_from_start(self):
+        self.make_dir()
+        with open(self.output + self.group + ".mbox", "w"):
+            print "creating output file"
+        with open(self.output + "threads.txt", 'w') as file_threads:
+            file_threads.write("XXX," + self.current_last_post_date + ",\n")
+        self.get_threads()
+
+    def get_complete_group(self):
+        print "Entered group: " + self.group
+        print "complete group will be downloaded\n"
+        self.make_dir()
+        with open(self.output + self.group + ".mbox", "w"):
+            print "creating output file"
+        if self.verbose is False:
+            print "starting download, please wait..."
+        self.get_threads()
+
+    def get_threads(self):
+        try:
+            html = self.opener.open(self.link_forum + self.group)
+        except urllib2.HTTPError:
+            return
+        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_worker_threads) as executor:
+            while True:
+                next_page = self.get_threads_and_date(html, executor)
+                if next_page == "":
+                    break
+                html = self.opener.open(next_page)
+
+    def get_threads_and_date(self, html, executor):
+        next_page = ""
+        current_id = ""
+        for line in html:
+            last_post_date = self.search_last_post_date(line)
+            if len(last_post_date) > 0:
+                if ":" in last_post_date:
+                    last_post_date = time.strftime("%d.%m.%y")
+                if self.compare_date(last_post_date, self.current_last_post_date) == -1:
+                    next_page = ""
+                    break
+                if last_post_date == self.current_last_post_date:
+                    return ""
+                executor.submit(self.get_messages, current_id)
+            href = self.search_href(line)
+            if len(href) == 0:
+                continue
+            topic_id = self.search_topic_id(self.group, href)
+            if "topic" in href:
+                current_id = topic_id
+            if "forum" in href:
+                next_page = href
+        return next_page
+
+    @staticmethod
+    def search_topic_id(group, line):
+        temp = re.findall("(?<=" + group + "/)(.*)", line)
+        if len(temp) == 0:
+            return ""
+        return temp[0]
+
+    @staticmethod
+    def search_href(line):
+        temp = re.findall("(href=\")(.*?)(\")", line)
+        if len(temp) == 0:
+            return ""
+        return temp[0][1]
+
+    @staticmethod
+    def search_last_post_date(line):
+        temp = re.findall("(?<=\"lastPostDate\">)([0-9\.:]*)", line)
+        if len(temp) == 0:
+            return ""
+        return temp[0]
+
+    # gets executed in own thread
+    def get_messages(self, topic_id):
+        counter = 0
+        if self.verbose is True:
+            print "getting thread " + topic_id
+        try:
+            html = self.opener.open(self.link_topic + self.group + "/" + topic_id)
+        except urllib2.HTTPError:
+            if self.verbose is True:
+                print "skipping deleted thread " + topic_id
+            return
+        msg_id = ""
+        for line in html:
+            last_post_date = self.search_last_post_date(line)
+            if len(last_post_date) > 0:
+                if len(self.current_last_post_date) > 0:
+                    if self.compare_date(last_post_date, self.current_last_post_date) < 1:
+                        continue
+                self.get_mbox(topic_id, msg_id)
+                counter += 1
+            href = self.search_href(line)
+            if len(href) == 0:
+                continue
+            if "/msg/" + self.group + "/" in href:
+                msg_id = self.search_msg_id(topic_id, href)
+            if len(last_post_date) > 0:
+                if last_post_date == self.current_last_post_date:
+                    if self.verbose is True:
+                        print str(counter) + " messages for topic " + topic_id + " downloaded."
+                    return ""
+        if self.verbose is True:
+            print str(counter) + " messages for topic " + topic_id + " downloaded."
+
+    @staticmethod
+    def search_msg_id(topic_id, href):
+        return re.findall("(?<=" + topic_id + "/)(.*)", href)[0]
+
+    def get_mbox(self, topic_id, msg_id):
+        if self.verbose is True:
+            print "\tdownloading -> topic: " + topic_id + " msg: " + msg_id
+        try:
+            mbox = self.opener.open(self.link_msg + self.group + "/" + topic_id + "/" + msg_id).read()
+        except urllib2.HTTPError:
+            if self.verbose is True:
+                print "skipping deleted message " + msg_id
+            return
+        self.add_to_mbox(mbox)
+
+    @staticmethod
+    def compare_date(date1, date2):
+        # date format: dd.mm.yy
+        date1_split = date1.split(".")
+        date2_split = date2.split(".")
+        if int(date1_split[2]) == int(date2_split[2]):
+            # year1 == year2
+            if int(date1_split[1]) == int(date2_split[1]):
+                # month1 == month2
+                if int(date1_split[0]) < int(date2_split[0]):
+                    # day1 < day2
+                    return -1
+                if int(date1_split[0]) == int(date2_split[0]):
+                    # day1 == day2
+                    return 0
+                else:
+                    # day1 > day2
+                    return 1
+            if int(date1_split[1]) < int(date2_split[1]):
+                return -1
+            else:
+                return 1
+        if int(date1_split[2]) < int(date2_split[2]):
+            return -1
+        else:
+            return 1
+
+    # creates subdirectories if they don't exist
+    def make_dir(self):
+        if not os.path.isdir(self.output):
+            print "creating directory\n"
+            os.makedirs(self.output)
+
+    def add_to_mbox(self, raw):
+        clean_mbox = "\n\n\nFrom Google Groups\n" + self.check_mbox(raw)
+        with self.lock:
+            with open(self.output + self.group + ".mbox", "a") as final_mbox:
+                final_mbox.write(clean_mbox)
+                self.counter += 1
+
+    def check_mbox(self, raw):
+        text = raw.split("\n")
+        clean = ""
+        for line in text:
+            if line.startswith("From "):
+                line = " " + line
+            if line.startswith("From: "):
+                line = self.handle_from_line(line) + "\n"
+            clean += line
+        return clean
+
+    @staticmethod
+    def search_name(line):
+        temp = re.findall("(?<=From: )\"?[ A-Za-z]*", line)
+        if len(temp) > 0:
+            name = temp[0].replace('"', '')
+            if name.endswith(" "):
+                name = name[:-1]
+            return name
+        return ""
+
+    @staticmethod
+    def search_email(line, name):
+        email = re.findall("<.*>", line)
+        if len(email) > 0:
+            if "..." in email[0]:
+                email_suffix = re.findall("@[a-zA-Z0-9\-.]*", line)
+                if len(email_suffix) != 0:
+                    email_suffix = email_suffix[0]
+                else:
+                    email_suffix = "@unknown.com"
+                return "<" + name.replace(" ", "_") + email_suffix + ">"
+        return ""
+
+    def handle_from_line(self, line):
+        name = self.search_name(line)
+        if len(name) != 0:
+            email = self.search_email(line, name)
+            if len(email) > 0:
+                return "From: " + name + " " + email
+        return line
+
+    def statistic(self):
+        print "download complete"
+        print "g3d downloaded " + str(self.counter) + " messages in " + str(int(self.elapsed_time)) + " seconds."
diff --git a/codeface/google_groups_py/run_g3d.py b/codeface/google_groups_py/run_g3d.py
new file mode 100644
index 00000000..9be6c263
--- /dev/null
+++ b/codeface/google_groups_py/run_g3d.py
@@ -0,0 +1,29 @@
+#! /usr/bin/env python
+## This file is part of Codeface. Codeface is free software: you can
+## redistribute it and/or modify it under the terms of the GNU General Public
+## License as published by the Free Software Foundation, version 2.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##
+## Copyright 2016 by Georg Berner
+## All Rights Reserved.
+
+from g3d import Crawler
+import sys
+
+def main():
+    argv = sys.argv
+    crawler = Crawler()
+    crawler.handle_arguments(argv)
+    if crawler.execute is True:
+        crawler.start()
+        crawler.statistic()
+
+main()
diff --git a/codeface/google_groups_py/utest_g3d.py b/codeface/google_groups_py/utest_g3d.py
new file mode 100644
index 00000000..82e77740
--- /dev/null
+++ b/codeface/google_groups_py/utest_g3d.py
@@ -0,0 +1,244 @@
+## This file is part of Codeface. Codeface is free software: you can
+## redistribute it and/or modify it under the terms of the GNU General Public
+## License as published by the Free Software Foundation, version 2.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##
+## Copyright 2016 by Georg Berner
+## All Rights Reserved.
+
+from g3d import Crawler
+import unittest
+
+
+class argumentHandlingTests(unittest.TestCase):
+    # no errors
+    def test1(self):
+        argv = ["-group", "test_group", "-start", "11.11.11", "-output", "./", "-threads", "50"]
+        crawler = Crawler()
+        crawler.handle_arguments(argv)
+        assert crawler.execute == True
+        assert crawler.group_is_set == True
+        assert crawler.group == "test_group"
+        assert crawler.max_worker_threads == 50
+        assert crawler.output_is_set == True
+        assert crawler.output == "./"
+        assert crawler.verbose == False
+
+    # -output missing -> should work
+    def test2(self):
+        argv = ["-group", "test_group", "-threads", "50"]
+        crawler = Crawler()
+        crawler.handle_arguments(argv)
+        assert crawler.execute == True
+        assert crawler.output_is_set == True
+        assert crawler.output == "./"
+
+    # -group missing
+    def test3(self):
+        argv = ["-output", "./", "-threads", "50"]
+        crawler = Crawler()
+        crawler.handle_arguments(argv)
+        assert crawler.execute == False
+        assert crawler.group_is_set == False
+
+    # -threads number missing
+    def test4(self):
+        argv = ["-group", "test_group", "-threads", "-output", "./"]
+        crawler = Crawler()
+        crawler.handle_arguments(argv)
+        assert crawler.execute == False
+        assert crawler.max_worker_threads == 1
+
+    # -threads number missing at end
+    def test5(self):
+        argv = ["", "-threads"]
+        crawler = Crawler()
+        crawler.handle_arguments(argv)
+        assert crawler.execute == False
+        assert crawler.max_worker_threads == 1
+
+    # -group follow up missing at end
+    def test6(self):
+        argv = ["", "-group"]
+        crawler = Crawler()
+        crawler.handle_arguments(argv)
+        assert crawler.execute == False
+        assert crawler.group_is_set == False
+
+
+    # -output follow up missing at end
+    def test7(self):
+        argv = ["-group", "test_group", "-output"]
+        crawler = Crawler()
+        crawler.handle_arguments(argv)
+        assert crawler.execute == True
+        assert crawler.output_is_set == True
+        assert crawler.output == "./"
+
+
+    # -v is set
+    def test8(self):
+        argv = ["", "-v"]
+        crawler = Crawler()
+        crawler.handle_arguments(argv)
+        assert crawler.verbose == True
+
+    # -start follow up missing
+    def test9(self):
+        argv = ["-group", "test_group", "-start", "-output", "./"]
+        crawler = Crawler()
+        crawler.handle_arguments(argv)
+        assert crawler.start_date_is_set == False
+
+
+    # start format check
+    def test10(self):
+        crawler = Crawler()
+        assert crawler.check_date_format("11.11.11") == True
+        assert crawler.check_date_format("111.11.11") == False
+        assert crawler.check_date_format("11.111.11") == False
+        assert crawler.check_date_format("11.11.111") == False
+
+
+class checkRegexFunction(unittest.TestCase):
+    # href
+    def test1(self):
+        crawler = Crawler()
+        line = '27.04.10