diff --git a/.gitignore b/.gitignore index 5de8a89..65f940a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ __pycache__ credentials.ini - +/.idea diff --git a/api/Comment.py b/api/Comment.py index 24bbbd1..f86d232 100644 --- a/api/Comment.py +++ b/api/Comment.py @@ -9,7 +9,7 @@ class search: params = None def on_get(self, req, resp): start = time.time() - q = req.get_param('q'); + q = req.get_param('q') self.params = req.params if 'ids' in self.params: data = self.getIds(self.params['ids']) diff --git a/api/DBFunctions.py b/api/DBFunctions.py index e8e214e..82f3fab 100644 --- a/api/DBFunctions.py +++ b/api/DBFunctions.py @@ -3,7 +3,8 @@ from configparser import ConfigParser config = ConfigParser() -config.read ('credentials.ini') +config.read('credentials.ini') + class pgdb: @@ -11,18 +12,19 @@ def __init__(self): self.connect() def connect(self): - DB_PASSWORD = config.get('database','password') - DB_USER = config.get('database','user') - self.db = psycopg2.connect("dbname='reddit' user='" + DB_USER + "' host='jupiter' password='" + DB_PASSWORD + "'") + DB_PASSWORD = config.get('database', 'password') + DB_USER = config.get('database', 'user') + self.db = psycopg2.connect( + "dbname='reddit' user='" + DB_USER + "' host='jupiter' password='" + DB_PASSWORD + "'") self.db.set_session(autocommit=True) - def execute(self,sql,params): + def execute(self, sql, params): retries = 5 while True: retries -= 1 try: cur = self.db.cursor() - cur.execute(sql,(params,)) + cur.execute(sql, (params,)) rows = cur.fetchall() cur.close() return rows @@ -35,5 +37,5 @@ def execute(self,sql,params): except: raise -pgdb = pgdb() +pgdb = pgdb() diff --git a/api/Helpers.py b/api/Helpers.py index 8a32a0c..4dc1cfa 100644 --- a/api/Helpers.py +++ b/api/Helpers.py @@ -3,6 +3,7 @@ import json import DBFunctions + def LooksLikeInt(s): try: int(s) @@ -10,6 +11,7 @@ def LooksLikeInt(s): except ValueError: return False + def base36encode(number, 
alphabet='0123456789abcdefghijklmnopqrstuvwxyz'): """Converts an integer to a base36 string.""" if not isinstance(number, (int, int)): @@ -31,9 +33,11 @@ def base36encode(number, alphabet='0123456789abcdefghijklmnopqrstuvwxyz'): return sign + base36 + def base36decode(number): return int(number, 36) + def getSubmissionsFromES(ids): nested_dict = lambda: defaultdict(nested_dict) if not isinstance(ids, (list, tuple)): @@ -52,11 +56,12 @@ def getSubmissionsFromES(ids): results[base_10_id] = source return results + def getSubmissionsFromPg(ids): if not isinstance(ids, (list, tuple)): ids = [ids] ids_to_get_from_db = [] - rows = DBFunctions.pgdb.execute("SELECT * FROM submission WHERE (json->>'id')::int IN %s LIMIT 5000",tuple(ids)) + rows = DBFunctions.pgdb.execute("SELECT * FROM submission WHERE (json->>'id')::int IN %s LIMIT 5000", tuple(ids)) results = {} data = {} if rows: diff --git a/api/Parameters.py b/api/Parameters.py index 869c098..0919877 100644 --- a/api/Parameters.py +++ b/api/Parameters.py @@ -4,12 +4,13 @@ import json import time -def process(params,q): + +def process(params, q): nested_dict = lambda: defaultdict(nested_dict) - params = {k.lower(): v for k, v in params.items()} # Lowercase all parameter names passed - suggested_sort = "desc"; + params = {k.lower(): v for k, v in params.items()} # Lowercase all parameter names passed + suggested_sort = "desc" - conditions = ["subreddit","author"] + conditions = ["subreddit", "author"] for condition in conditions: if condition in params and params[condition] is not None: terms = nested_dict() @@ -54,6 +55,40 @@ def process(params,q): else: params['before'] = None + if 'retrieved_before' in params and params['retrieved_before'] is not None: + if LooksLikeInt(params['retrieved_before']): + params['retrieved_before'] = int(params['retrieved_before']) + elif params['retrieved_before'][-1:].lower() == "d": + params['retrieved_before'] = int(time.time()) - (int(params['retrieved_before'][:-1]) * 86400) + elif 
params['retrieved_before'][-1:].lower() == "h": + params['retrieved_before'] = int(time.time()) - (int(params['retrieved_before'][:-1]) * 3600) + elif params['retrieved_before'][-1:].lower() == "m": + params['retrieved_before'] = int(time.time()) - (int(params['retrieved_before'][:-1]) * 60) + elif params['retrieved_before'][-1:].lower() == "s": + params['retrieved_before'] = int(time.time()) - (int(params['retrieved_before'][:-1])) + range = nested_dict() + range['range']['retrieved_on']['lt'] = params['retrieved_before'] + q['query']['bool']['filter'].append(range) + else: + params['retrieved_before'] = None + + if 'retrieved_after' in params and params['retrieved_after'] is not None: + if LooksLikeInt(params['retrieved_after']): + params['retrieved_after'] = int(params['retrieved_after']) + elif params['retrieved_after'][-1:].lower() == "d": + params['retrieved_after'] = int(time.time()) - (int(params['retrieved_after'][:-1]) * 86400) + elif params['retrieved_after'][-1:].lower() == "h": + params['retrieved_after'] = int(time.time()) - (int(params['retrieved_after'][:-1]) * 3600) + elif params['retrieved_after'][-1:].lower() == "m": + params['retrieved_after'] = int(time.time()) - (int(params['retrieved_after'][:-1]) * 60) + elif params['retrieved_after'][-1:].lower() == "s": + params['retrieved_after'] = int(time.time()) - (int(params['retrieved_after'][:-1])) + range = nested_dict() + range['range']['retrieved_on']['gt'] = params['retrieved_after'] + q['query']['bool']['filter'].append(range) + else: + params['retrieved_after'] = None + if 'score' in params and params['score'] is not None: range = nested_dict() if params['score'][:1] == "<": @@ -74,13 +109,13 @@ def process(params,q): range['term']['num_comments'] = int(params['num_comments']) q['query']['bool']['filter'].append(range) - conditions = ["over_18","is_video","stickied","spoiler","locked","contest_mode"] + conditions = ["over_18", "is_video", "stickied", "spoiler", "locked", "contest_mode"] for 
condition in conditions: if condition in params and params[condition] is not None: parameter = nested_dict() if params[condition].lower() == 'true' or params[condition] == "1": - parameter['term'][condition] = "true" - print ("Got here") + parameter['term'][condition] = "true" + print("Got here") elif params[condition].lower() == 'false' or params[condition] == "0": parameter['term'][condition] = "false" q['query']['bool']['filter'].append(parameter) @@ -108,10 +143,9 @@ def process(params,q): params['sort'] = suggested_sort q['sort'][params['sort_type']] = params['sort'] - if 'frequency' in params and params['frequency'].lower() in ['second','minute','hour','day','week','month']: + if 'frequency' in params and params['frequency'].lower() in ['second', 'minute', 'hour', 'day', 'week', 'month']: params['frequency'] = params['frequency'].lower() else: params['frequency'] = None - return(params,q) - + return (params, q) diff --git a/api/Submission.py b/api/Submission.py index 3693348..43dfe71 100644 --- a/api/Submission.py +++ b/api/Submission.py @@ -7,22 +7,23 @@ class search: params = None + def on_get(self, req, resp): self.start = time.time() - q = req.get_param('q'); + q = req.get_param('q') self.params = req.params if 'ids' in self.params: data = self.getIds(self.params['ids']) end = time.time() data["metadata"] = {} - data["metadata"]["execution_time_milliseconds"] = round((end - self.start) * 1000,2) + data["metadata"]["execution_time_milliseconds"] = round((end - self.start) * 1000, 2) data["metadata"]["version"] = "v3.0" - resp.cache_control = ["public","max-age=2","s-maxage=2"] - resp.body = json.dumps(data,sort_keys=True,indent=4, separators=(',', ': ')) + resp.cache_control = ["public", "max-age=2", "s-maxage=2"] + resp.body = json.dumps(data, sort_keys=True, indent=4, separators=(',', ': ')) return - response = self.search("http://mars:9200/rs/submissions/_search"); + response = self.search("http://mars:9200/rs/submissions/_search") results = [] data = 
{} for hit in response["data"]["hits"]["hits"]: @@ -61,8 +62,9 @@ def on_get(self, req, resp): data['aggs'] = {} if 'subreddit' in response['data']['aggregations']: for bucket in response['data']['aggregations']['subreddit']['buckets']: - bucket['score'] = round(bucket['doc_count'] / bucket['bg_count'],5) - newlist = sorted(response['data']['aggregations']['subreddit']['buckets'], key=lambda k: k['score'], reverse=True) + bucket['score'] = round(bucket['doc_count'] / bucket['bg_count'], 5) + newlist = sorted(response['data']['aggregations']['subreddit']['buckets'], key=lambda k: k['score'], + reverse=True) data['aggs']['subreddit'] = newlist if 'author' in response['data']['aggregations']: @@ -89,37 +91,39 @@ def on_get(self, req, resp): if 'time_of_day' in response['data']['aggregations']: for bucket in response['data']['aggregations']['time_of_day']['buckets']: - bucket['bg_percentage'] = round(bucket['bg_count'] * 100 / response['data']['aggregations']['time_of_day']['bg_count'], 5) - bucket['doc_percentage'] = round(bucket['doc_count'] * 100 / response['data']['aggregations']['time_of_day']['doc_count'], 5) - bucket['deviation_percentage'] = round(bucket['doc_percentage'] - bucket['bg_percentage'],4) + bucket['bg_percentage'] = round( + bucket['bg_count'] * 100 / response['data']['aggregations']['time_of_day']['bg_count'], 5) + bucket['doc_percentage'] = round( + bucket['doc_count'] * 100 / response['data']['aggregations']['time_of_day']['doc_count'], 5) + bucket['deviation_percentage'] = round(bucket['doc_percentage'] - bucket['bg_percentage'], 4) bucket['utc_hour'] = bucket['key'] bucket.pop('score', None) - bucket.pop('key',None) - newlist = sorted(response['data']['aggregations']['time_of_day']['buckets'], key=lambda k: k['utc_hour']) + bucket.pop('key', None) + newlist = sorted(response['data']['aggregations']['time_of_day']['buckets'], + key=lambda k: k['utc_hour']) data['aggs']['time_of_day'] = newlist - end = time.time() - data['data'] = results; + 
data['data'] = results data['metadata'] = {} data['metadata'] = response['metadata'] data['metadata'] = self.params - data['metadata']['execution_time_milliseconds'] = round((end - self.start) * 1000,2) + data['metadata']['execution_time_milliseconds'] = round((end - self.start) * 1000, 2) data['metadata']['version'] = 'v3.0' data['metadata']['results_returned'] = len(response['data']['hits']['hits']) data['metadata']['timed_out'] = response['data']['timed_out'] data['metadata']['total_results'] = response['data']['hits']['total'] data['metadata']['shards'] = {} data['metadata']['shards'] = response['data']['_shards'] - resp.cache_control = ['public','max-age=2','s-maxage=2'] - resp.body = json.dumps(data,sort_keys=True,indent=4, separators=(',', ': ')) + resp.cache_control = ['public', 'max-age=2', 's-maxage=2'] + resp.body = json.dumps(data, sort_keys=True, indent=4, separators=(',', ': ')) def search(self, uri): nested_dict = lambda: defaultdict(nested_dict) q = nested_dict() q['query']['bool']['filter'] = [] q['query']['bool']['must_not'] = [] - self.params, q = Parameters.process(self.params,q) + self.params, q = Parameters.process(self.params, q) if 'q' in self.params and self.params['q'] is not None: sqs = nested_dict() @@ -127,7 +131,7 @@ def search(self, uri): sqs['simple_query_string']['default_operator'] = 'and' q['query']['bool']['filter'].append(sqs) - conditions = ["title","selftext"] + conditions = ["title", "selftext"] for condition in conditions: if condition in self.params and self.params[condition] is not None: sqs = nested_dict() @@ -147,7 +151,8 @@ def search(self, uri): q['query']['bool']['must_not'].append(sqs) min_doc_count = 0 - if 'min_doc_count' in self.params and self.params['min_doc_count'] is not None and LooksLikeInt(self.params['min_doc_count']): + if 'min_doc_count' in self.params and self.params['min_doc_count'] is not None and LooksLikeInt( + self.params['min_doc_count']): min_doc_count = self.params['min_doc_count'] if 'aggs' in 
self.params: @@ -158,16 +163,17 @@ def search(self, uri): q['aggs']['subreddit']['significant_terms']['field'] = 'subreddit.keyword' q['aggs']['subreddit']['significant_terms']['size'] = 1000 q['aggs']['subreddit']['significant_terms']['script_heuristic']['script']['lang'] = 'painless' - q['aggs']['subreddit']['significant_terms']['script_heuristic']['script']['inline'] = 'params._subset_freq' + q['aggs']['subreddit']['significant_terms']['script_heuristic']['script'][ + 'inline'] = 'params._subset_freq' q['aggs']['subreddit']['significant_terms']['min_doc_count'] = min_doc_count if agg.lower() == 'author': q['aggs']['author']['terms']['field'] = 'author.keyword' q['aggs']['author']['terms']['size'] = 1000 q['aggs']['author']['terms']['order']['_count'] = 'desc' - #q['aggs']['author']['significant_terms']['script_heuristic']['script']['lang'] = 'painless' - #q['aggs']['author']['significant_terms']['script_heuristic']['script']['inline'] = 'params._subset_freq' - #q['aggs']['author']['significant_terms']['min_doc_count'] = min_doc_count + # q['aggs']['author']['significant_terms']['script_heuristic']['script']['lang'] = 'painless' + # q['aggs']['author']['significant_terms']['script_heuristic']['script']['inline'] = 'params._subset_freq' + # q['aggs']['author']['significant_terms']['min_doc_count'] = min_doc_count if agg.lower() == 'created_utc': q['aggs']['created_utc']['date_histogram']['field'] = 'created_utc' @@ -230,6 +236,7 @@ def getIds(self, ids): data["metadata"] = {} return data + class getCommentIDs: def on_get(self, req, resp, submission_id): @@ -237,14 +244,15 @@ def on_get(self, req, resp, submission_id): if submission_id[:3] == 't3_': submission_id = submission_id[3:] submission_id = base36decode(submission_id) - rows = DBFunctions.pgdb.execute("SELECT (json->>'id')::bigint comment_id FROM comment WHERE (json->>'link_id')::int = %s ORDER BY comment_id ASC LIMIT 50000",submission_id) + rows = DBFunctions.pgdb.execute( + "SELECT (json->>'id')::bigint 
comment_id FROM comment WHERE (json->>'link_id')::int = %s ORDER BY comment_id ASC LIMIT 50000", + submission_id) results = [] data = {} if rows: for row in rows: comment_id = row[0] results.append(base36encode(comment_id)) - data['data'] = results; - resp.cache_control = ["public","max-age=5","s-maxage=5"] - resp.body = json.dumps(data,sort_keys=True,indent=4, separators=(',', ': ')) - + data['data'] = results + resp.cache_control = ["public", "max-age=5", "s-maxage=5"] + resp.body = json.dumps(data, sort_keys=True, indent=4, separators=(',', ': ')) diff --git a/api/User.py b/api/User.py index e3aefd6..ce09875 100644 --- a/api/User.py +++ b/api/User.py @@ -19,7 +19,7 @@ def on_get(self, req, resp, author): if author is not None: terms = nested_dict() - terms['terms']['author'] = [author.lower()] + terms['terms']['author'] = [author.lower()] q['query']['bool']['filter'].append(terms) q['size'] = size @@ -40,7 +40,7 @@ def on_get(self, req, resp, author): request = requests.get(searchURL, data=json.dumps(q)) response = json.loads(request.text) - if response.get('aggregations', {}).get('link_id', {}).get('buckets',{}): + if response.get('aggregations', {}).get('link_id', {}).get('buckets', {}): for row in response['aggregations']['link_id']['buckets']: row['key'] = 't3_' + base36encode(row['key']) @@ -48,10 +48,7 @@ def on_get(self, req, resp, author): data = {} data['data'] = response data['metadata'] = {} - data['metadata']['execution_time_milliseconds'] = round((end - start) * 1000,2) + data['metadata']['execution_time_milliseconds'] = round((end - start) * 1000, 2) data['metadata']['version'] = 'v3.0' - resp.cache_control = ['public','max-age=2','s-maxage=2'] - resp.body = json.dumps(data,sort_keys=True,indent=4, separators=(',', ': ')) - - - + resp.cache_control = ['public', 'max-age=2', 's-maxage=2'] + resp.body = json.dumps(data, sort_keys=True, indent=4, separators=(',', ': ')) diff --git a/api/api.py b/api/api.py index 7c58cd3..268f74e 100755 --- 
a/api/api.py +++ b/api/api.py @@ -19,7 +19,6 @@ from Helpers import * from configparser import ConfigParser - api = falcon.API() api.add_route('/reddit/search', Comment.search()) api.add_route('/reddit/comment/search', Comment.search()) @@ -29,5 +28,3 @@ api.add_route('/reddit/analyze/user/{author}', User.Analyze()) api.add_route('/get/comment_ids/{submission_id}', Submission.getCommentIDs()) api.add_route('/reddit/submission/comment_ids/{submission_id}', Submission.getCommentIDs()) - -