forked from ankushagrawal94/TheHackerNewsBump
-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyzeData.py
288 lines (249 loc) · 9.21 KB
/
analyzeData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
import MySQLdb
import time
import re
import datetime
# ---------------------------------------------------------------------------
# Hacker News "bump" analysis script.
#
# For every (HN points bucket, GitHub stars bucket) pair this script:
#   1. selects the matching rows from `hn_event_max`,
#   2. loads, for each repo, the daily star counts from
#      `event_table_condensed` in a window of 7 days either side of the
#      event time (15 days in total),
#   3. accumulates raw star counts and day-over-day percent changes,
#   4. writes 15 rows (one per day) of averaged data to `chart_table`.
#
# The statements below use only syntax that is valid on Python 2.6+ as well
# as Python 3 (single-argument `print(...)`, `except ... as`), so the script
# keeps running on the Python 2 interpreter the original was written for.
# ---------------------------------------------------------------------------

# Number of days analysed per event: 7 before, the event day, 7 after.
_WINDOW_DAYS = 15
_DAYS_EACH_SIDE = 7


def _next_threshold(thresholds, next_index):
    """Return thresholds[next_index], or None when the bucket is open-ended.

    The last entry of a threshold list has no upper neighbour; None tells
    the query builder to use a `>` comparison instead of BETWEEN.
    """
    if next_index < len(thresholds):
        return thresholds[next_index]
    return None


def _select_hn_events(cursor, star_val, star_cap, point_val, point_cap):
    """Fetch the hn_event_max rows that fall into one stars/points bucket.

    Each dimension is bounded independently: `BETWEEN low AND cap` when a
    cap exists, `> low` when the cap is None (top bucket).  Values are
    passed as query parameters rather than interpolated into the SQL.
    """
    clauses = []
    params = []
    for column, low, cap in (("stars", star_val, star_cap),
                             ("hn_points", point_val, point_cap)):
        if cap is None:
            clauses.append(column + " > %s")
            params.append(low)
        else:
            clauses.append(column + " BETWEEN %s AND %s")
            params.extend((low, cap))
    cursor.execute(
        "SELECT * FROM hn_event_max WHERE " + " AND ".join(clauses),
        tuple(params))
    return cursor.fetchall()


def _daily_star_series(cursor, repo_name, event_time):
    """Return the 15 daily star counts around event_time (0 = no row).

    Position i of the result corresponds to `event_time - 7 days + i days`.
    A day only counts as present when a row's event_time equals that
    timestamp exactly.
    """
    window_start = event_time + datetime.timedelta(days=-_DAYS_EACH_SIDE)
    window_end = event_time + datetime.timedelta(days=_DAYS_EACH_SIDE)
    cursor.execute(
        "SELECT * FROM event_table_condensed WHERE repo_name = %s "
        "AND event_time BETWEEN %s AND %s",
        (repo_name, window_start, window_end))
    stars_by_time = {}
    for row in cursor.fetchall():
        # Columns: row[0] repo_name, row[1] stars, row[2] event_time.
        # setdefault keeps the first row seen for a given timestamp.
        stars_by_time.setdefault(row[2], row[1])
    return [
        stars_by_time.get(window_start + datetime.timedelta(days=offset), 0)
        for offset in range(_WINDOW_DAYS)
    ]


def _fill_missing_days(series):
    """Return a copy of series with zero entries replaced by a neighbour.

    Leading zeros take the first non-zero value; every later zero takes the
    value of the day before it.  If the series contains at least one
    non-zero value the result contains no zeros.
    """
    filled = list(series)
    if not filled:
        return filled
    first_non_zero = 0
    for value in filled:
        if value != 0:
            first_non_zero = value
            break
    if filled[0] == 0:
        filled[0] = first_non_zero
    for position in range(1, len(filled)):
        if filled[position] == 0:
            filled[position] = filled[position - 1]
    return filled


def _percent_changes(series):
    """Return the fractional day-over-day change for each adjacent pair."""
    return [
        (float(today) - float(yesterday)) / float(yesterday)
        for yesterday, today in zip(series, series[1:])
    ]


def _compound(changes, count):
    """Return the product of (1 + change) over the first `count` changes."""
    product = 1
    for change in changes[:count]:
        product *= (1 + change)
    return product


start_time = time.time()
elapsed_time = time.time()
# NOTE(review): connection credentials are hard-coded in this script.
db = MySQLdb.connect(host="localhost",  # your host, usually localhost
                     user="root",  # your username
                     passwd="password",  # your password
                     db="githubDB")  # name of the data base
cur = db.cursor()
event_num = 0
completed_events = 0
big_event_count = 0
# Only bucket pairs whose 1-based sequence number lies in [start, stop] are
# processed; the rest are counted and skipped.
start = 0
stop = 160
infinite_growth = 0
hn_val_counter = 0
try:
    hn_point_params = [5, 10, 50, 100, 150, 200, 250, 300, 400, 500]
    gh_point_params = [5, 10, 50, 100, 500, 1000, 2500, 5000, 7500, 10000,
                       20000, 30000, 40000, 50000, 60000, 70000]
    total_buckets = len(hn_point_params) * len(gh_point_params)
    for point_val in hn_point_params:
        gh_point_counter = 0
        # After the increment the counter indexes the NEXT threshold, i.e.
        # the upper bound of the current bucket.
        hn_val_counter += 1
        point_cap = _next_threshold(hn_point_params, hn_val_counter)
        for star_val in gh_point_params:
            event_num += 1
            gh_point_counter += 1
            if event_num < start or event_num > stop:
                continue
            star_cap = _next_threshold(gh_point_params, gh_point_counter)
            print("You are on big_pic #%s of %s" % (event_num, total_buckets))
            print("Beginning analysis for hn_points = %s & gh_stars = %s" % (point_val, star_val))
            print("Total elapsed time is: %s seconds" % int(time.time() - start_time))
            print("Performing Query ... \n")
            hn_event_list = _select_hn_events(
                cur, star_val, star_cap, point_val, point_cap)
            if not hn_event_list:
                print("Skipping event# %s; No big_pic with %s points and %s stars" % (event_num, point_val, star_val))
                continue

            # Per-bucket accumulators, summed over every analysed repo.
            global_percent_change = [0] * (_WINDOW_DAYS - 1)
            global_raw_stars = [0] * _WINDOW_DAYS
            event_count = 0
            prev_row = ''
            # Number of repos that actually contribute; decremented for
            # every duplicate or all-zero repo that gets skipped.
            hn_event_list_size = len(hn_event_list)
            for event in hn_event_list:
                event_count += 1
                repo_name = event[0]
                event_time = event[3]
                # Skip duplicates: only consecutive rows with the same repo
                # name are detected.
                if repo_name == prev_row:
                    hn_event_list_size -= 1
                    continue
                prev_row = repo_name

                print("Beginning analysis for: %s" % repo_name)
                raw_num_stars = _daily_star_series(cur, repo_name, event_time)
                print("total elapsed time is: %s seconds" % int(time.time() - start_time))
                if not any(raw_num_stars):
                    hn_event_list_size -= 1
                    print("All zero data. Skipping")
                    print('--------------------------------------')
                    continue

                # At least one day is non-zero, so after filling no entry is
                # zero and the percent-change division below is safe.
                raw_num_stars = _fill_missing_days(raw_num_stars)

                print("processing list")
                repo_percent_change = _percent_changes(raw_num_stars)
                for day, change in enumerate(repo_percent_change):
                    global_percent_change[day] += change
                for day, star_count in enumerate(raw_num_stars):
                    global_raw_stars[day] += star_count
                completed_events += 1
                print("\ncompleted event #%s of %s" % (event_count, hn_event_list_size))
                print("total elapsed time is: %s seconds" % int(time.time() - start_time))
                print("average time per repo is: %s seconds" % int((time.time() - start_time)/completed_events))
                print('--------------------------------------')

            if sum(global_raw_stars) == 0:
                print("All zero data. Skipping")
                print('--------------------------------------')
                continue

            print("calculating global data...")
            # Day 0 has no previous day, so its change is defined as 0; this
            # aligns the change list with the 15-entry star list.
            global_percent_change.insert(0, 0)
            # NOTE: under Python 2, `/` between two ints is integer division.
            global_percent_avg = [
                change / hn_event_list_size for change in global_percent_change
            ]
            global_raw_star_avg = [
                total / hn_event_list_size for total in global_raw_stars
            ]
            print("successfully created global_percent_avg and global_raw_star_avg")

            # NOTE(review): the compounding below uses the SUMMED changes
            # (global_percent_change), not the averages, and covers the
            # first 14 of 15 entries -- kept as in the original; confirm
            # that this is intended.
            base = 1
            total_sum = _compound(global_percent_change, 14)
            mid = _compound(global_percent_change, 8)
            print(total_sum)
            print(mid)
            print(base)
            print(global_percent_change)
            print(global_raw_star_avg)
            print(global_raw_stars)
            print(hn_event_list_size)
            try:
                # Relative difference between growth after the midpoint and
                # growth before it; undefined when either divisor is zero.
                global_delta_growth = ((total_sum - mid)/mid - (mid - base)/base)/((mid - base)/base)
            except ZeroDivisionError:
                infinite_growth += 1
                print("infinite growth")
                continue
            print("successfully created global_delta_growth to be: %s" % global_delta_growth)

            inner_db = MySQLdb.connect(host="localhost",  # your host, usually localhost
                                       user="root",  # your username
                                       passwd="password",  # your password
                                       db="githubDB")  # name of the data base
            try:
                # write new data to database
                print("Writing to chart_table ...\n")
                inner_cur = inner_db.cursor()
                try:
                    for day in range(_WINDOW_DAYS):  # iterates through the days
                        inner_cur.execute(
                            """INSERT INTO chart_table (day, slider_stars, slider_hn_points, daily_total_stars, daily_growth, change_in_growth)
                            VALUES (%s, %s, %s, %s, %s, %s)""",
                            (day, star_val, point_val, global_raw_star_avg[day],
                             global_percent_avg[day]*100, global_delta_growth))
                    inner_db.commit()
                finally:
                    inner_cur.close()
            finally:
                # Always release the per-bucket connection, even when an
                # INSERT fails.
                inner_db.close()

            big_event_count += 1
            print('\n\n\n--------------------------------------')
            print("data insertion complete\n")
            print("Summary for %s hn_points and %s stars." % (point_val, star_val))
            print("global raw star average is: %s" % (global_raw_star_avg))
            print("global percent average is: %s" % (global_percent_avg))
            print("global delta growth is: %s" % (global_delta_growth))
            print("completed data set #%s of %s" % (event_num, total_buckets))
            print("total elapsed time is: %s seconds" % int(time.time() - start_time))
            print("average time per overall request is: %s seconds" % int((time.time() - start_time)/completed_events))
            print("expected time remaining is: %s" % (int((time.time() - start_time)/completed_events) * int(total_buckets)))
            print('\n\n\n--------------------------------------')
except KeyboardInterrupt:
    # Ctrl-C is not an Exception subclass, so it needs its own handler; the
    # summary in `finally` is still printed.
    print('Keyboard interupt received.')
except Exception as e:
    # Top-level boundary: report the error and fall through to the summary.
    print(e)
finally:
    print('\n--------------------------------------')
    print('--------------------------------------')
    print("start: %s" % start)
    print("stop: %s" % stop)
    print("big event completed #%s" % big_event_count)
    print("event_num: %s" % event_num)
    print("infinite_growth occured %s times" % infinite_growth)
    print("total elapsed time: %s" % (time.time() - start_time))
    print('\n--------------------------------------')
    cur.close()
    db.close()