-
Notifications
You must be signed in to change notification settings - Fork 0
/
student_times_reducer.py
54 lines (45 loc) · 1.87 KB
/
student_times_reducer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/python
import csv
import sys
def reducer():
    """Write, for each key read from stdin, the hour(s) with the top count.

    Input is read from ``sys.stdin`` as tab-delimited rows of the form
    ``key<TAB>hour``; rows that do not have exactly two fields are skipped.
    Counts are accumulated per hour until the key changes, at which point
    the finished group is handed to ``write_row``. Because a key change
    flushes the group, rows for one key are only counted together when they
    arrive contiguously.

    Output goes to ``sys.stdout`` as tab-delimited, fully quoted rows.
    """
    # Using CSV reader and writer to simplify input/output
    reader = csv.reader(sys.stdin, delimiter='\t')
    writer = csv.writer(sys.stdout, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)

    highCount = -1
    lastKey = None
    hourlyCounts = {}  # hour -> number of rows seen for the current key

    for line in reader:
        if len(line) != 2:  # skip bad data
            continue

        theKey, theHour = line

        # Compare against None explicitly: a truthiness test would treat an
        # empty-string key as "no previous key" and silently merge its
        # counts into the group of the key that follows it.
        if lastKey is not None and theKey != lastKey:
            # The key has changed, so the previous group is complete
            write_row(writer, lastKey, highCount, hourlyCounts)
            # Start a fresh group for the new key
            highCount = -1
            hourlyCounts = {}

        # Count this row for its hour (first occurrence starts at 1)
        hourlyCounts[theHour] = hourlyCounts.get(theHour, 0) + 1

        # Track the largest per-hour count seen so far for this key
        if hourlyCounts[theHour] > highCount:
            highCount = hourlyCounts[theHour]

        lastKey = theKey

    # The final group has no following key change to trigger its write;
    # lastKey stays None only when no valid row was read at all.
    if lastKey is not None:
        write_row(writer, lastKey, highCount, hourlyCounts)
# This function will write the record(s) with the highest hourly count
def write_row(writer, lastKey, highCount, hourlyCounts):
    """Write one ``[lastKey, hour]`` row for every hour whose count is highCount.

    Args:
        writer: object with a ``writerow(sequence)`` method, e.g. a
            ``csv.writer``.
        lastKey: key value placed in the first column of every row written.
        highCount: count an hour must equal for its row to be written.
        hourlyCounts: dict mapping hour -> count for ``lastKey``.

    Several rows are written when more than one hour ties at ``highCount``;
    nothing is written when no hour matches.
    """
    # dict.items() exists on both Python 2 and 3, whereas iteritems() is
    # Python 2 only and raises AttributeError on Python 3.
    for hour, count in hourlyCounts.items():
        if count == highCount:
            writer.writerow([lastKey, hour])
# The following is to be able to test outside of hadoop
def main():
    """Run the reducer once over stdin, writing its results to stdout."""
    reducer()


# Only run when executed as a script; an unguarded call would make a plain
# import of this module read stdin and write to stdout.
if __name__ == "__main__":
    main()