\\n', not\n%s"%last_tok(s))
+ if not match:
+ return '', []
+ division = match.group(1)
+ s = s[match.end():]
+ # print("got DIVISION: '%s' in dept '%s'"%(division, dept))
+
+ courses = []
+ while s:
+ s, result = parse_course(s, dept=dept, division=division, dept_lookup=dept_lookup)
+ if result:
+ courses.append(result)
+ else:
+ break
+ return s, courses
+
+def parse_course_page (page, dept_lookup):
+ text = page.content
+ courses = []
+ while text:
+ text, result = parse_division(text, dept=page.dept, dept_lookup=dept_lookup)
+ # if result:
+ # print("Parsed %s courses from %s (%s)"%(len(result), page.dept, result[0].division))
+ courses += result
+ return courses
+
+def fixup_course_lookup (lookup):
+ lookup['Chemistry'] = lookup['Chemistry and Biochemistry']
+
+def parse_course_pages (*args, **kwargs):
+ pages = list(fetch_course_pages(*args, **kwargs))
+ dept_lookup = {}
+ for page in pages:
+ dept_lookup[page.title] = page.dept
+ fixup_course_lookup(dept_lookup)
+
+ # print("Dept lookup:")
+ items = sorted(list(dept_lookup.items()), key=lambda x: x[0])
+ for i, (title, dept) in enumerate(items):
+ print("\t%d\t'%s': '%s'"%(i, title, dept))
+
+ for page in pages:
+ for result in parse_course_page(page, dept_lookup=dept_lookup):
+ yield result
+
+if __name__ == '__main__':
+ with open('prereqs', 'w') as f:
+ f.write('')
+ with open('unparsed', 'w') as f:
+ f.write('')
+
+ courses = list(parse_course_pages())
+ # print("Parsed %s courses"%len(courses))
+
+ byDept = {}
+ byDiv = {}
+ for course in courses:
+ if not course.dept in byDept:
+ byDept[course.dept] = []
+ if not course.division in byDiv:
+ byDiv[course.division] = []
+ byDept[course.dept].append(course)
+ byDiv[course.division].append(course)
+
+ # print("Courses by department:")
+ # for dept, courses in byDept.items():
+ # print("\t%s: %s course(s)"%(dept, len(courses)))
+
+ # print("Courses by division:")
+ # for div, courses in byDiv.items():
+ # print("\t%s: %s course(s)"%(div, len(courses)))
+
+
+ # print(fetch_course_pages())
+ # map(parse_course_page, fetch_course_pages())
diff --git a/crawlers/ucsc/prereq_parser.py b/crawlers/ucsc/prereq_parser.py
new file mode 100644
index 0000000..b38bbbe
--- /dev/null
+++ b/crawlers/ucsc/prereq_parser.py
@@ -0,0 +1,285 @@
+import re
+
+def unspacify (s):
+ return ''.join([ w.strip() for w in s.strip().split('\n') ])
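+
+# Quick sanity check (the input string is illustrative only): unspacify strips each
+# line and joins them, collapsing an indented multi-line pattern into a single line.
+assert(unspacify('  a|\n  b|\n  c  ') == 'a|b|c')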
+
+tokenizer = re.compile(unspacify(r'''
+ ([Cc]ourses?)|
+ (:|
+ \s+in\s+|\s+a\s+|\s+from\s+|\s+is\s+|\s+for\s+|
+ (?:[,;]?\s+(?:and\s+)?)?[Ss]atisfaction\s+of\s+(?:the\s+)?Entry\s+Level\s+Writing(?:\s+and\s+Composition)?(?:\s+[Rr]equirements?)?|
+ (?:pass\s+)?swimming(?:\s+ability|\s+skills\s+tests?\s+and\s+medical\s+clearance)|
+ (?:graduate|upper).+standing|
+ open\s+to\s+graduate\s+students|
+ undergrads.+instructor|
+ restricted.+students|
+ completion.+requirements?|
+ enrollment.+members?|
+ enrolled.+meeting|
+ score.+MPE\)|
+ score.+higher|
+ score.+of\s+\d+|
+ equivalent|
+ skills|
+ math.+background|
+ an\s+Undergraduate\s+Research\s+Contract.+department|
+ the\s+following:|
+ submission.+process|
+ proposal.+supervise|
+ approval.+major|
+ approval.+preceptor|
+ College.+Writing|
+ Completion|
+ satisfaction.+requirements?|
+ permission.+department|
+ intro.+classroom|
+ for\s+an\s+understanding.+program|
+ acceptance.+program|
+ skill.+test|
+ satisfaction.+requirement|
+ in\s+the.+Program|
+ (?:completing|enrollment).+instructor|
+ basic.+assumed|
+ consent.+(?:instructor|coordinator)|
+ is.+course|
+ prior.+enrollment\s+in|
+ highly.+preparation|
+ essay.+life|
+ intro.+tion|
+ by.+coordinator|
+ college.+approval|approval.+college|
+ suggested|
+ college-level.+coursework|
+ students.+instructor|
+ previous.+enrollment\s+in|
+ (?:is\s+restricted\s+to\s+)(?:feminist|psychology).+majors|
+ (?:a\s+)score.+(?:higher|MPE\))|
+ selection.+work|
+ enrollment.+interview|
+ high\s+school.+recommended|
+ basic\s+college|
+ in\s+ocean.+recommended|
+ no.+quarter|
+ core|
+ university.+biology|
+ operational.+language|
+ interview.+meeting|
+ must.+C\+\+|
+ introductory\s+statistics\s+course\s+\(.+ent\)|
+ \(|\)|
+ research.+department|
+ (?:or\s+)?(?:by\s+)?permission.+instructor|
+ interview.+project|
+ upper.+(?:recommended|supervise)|
+ sumbission.+process|
+ prior.+major|
+ placement\s+by\s+examination|
+ at\s+least\s+one\s+astronomy\s+course|
+ \(or\s+equivalent\)|
+ high-school\s+level\s+chemistry|
+ pass(?:\s+in)?Swimming\s+Level\s+I+\s+course.+skills|
+ (?:in\s+)freestyle.+breaststroke|
+ (?:by\s+)?(?:consent|permission)?(?:\s+of(?:\s+the)?\s+instructor)?|
+ instructor\s+determin(?:ation|es\s+skill\s+level)\s+at\s+first\s+class\s+meeting|
+ [Bb]asic\s+knowledge\s+of\s+computer\s+programming\s+languages\s+is\s+assumed|
+ basic\s+rowing|
+ more\s+hours\s+of\s+club\s+keelboat\s+useage|
+ advancement.+agency|
+ (?:instructor ?)determination\s+at\s+first\s+class\s+meeting|
+ a\s+writing.+meeting|
+ intended.+only|
+ mathematics\s+placement.+higher|
+ interview.+materials|
+ students.+agency|
+ pass.+skills|
+ interview.+preparedness|
+ work.+interview|
+ (?:a\s+)proposal.+supervise|
+ instructor.+permission|
+ open\s+only\s+Press|
+ instructor.+level|
+ certification.+clearance|
+ special.+instructor|
+ completion.+LA|
+ interview.+only|
+ excellent.+courses|
+ enrollment.+majors|
+ instructor.+required|
+ for.+perission(?:.+enroll)?|
+ or\s+.+equivalent|
+ enroll.+seniors|
+ concurrent.+enrollment|
+ basic.+Fortran|
+ calculus.+algebra|
+ instructor.+approval|
+ A\s+background.+programming|
+ satisfactory.+exam|
+ must.+(?:book|skills)|
+ priority.+concentration|
+ another\s+screenwriting\s+course|
+ petition.+concentration\)?|
+ history.+seminar|
+ (?:one\s+year|years\s+of).+language|
+ qualifications.+meeting|
+ equivalent\s+skills|
+ interview.+portfolio|
+ (?:(?:a.+)?placement|AWPE).+score\s+of\s+\d+|
+ taking.+recommended|
+ approval\s+of\s+the\s+Writing\s+Program|
+ [Pp]revious(?:\s+course\s+in\s+ocean\s+sciences)?|
+ Basic\s+Scuba\s+Certification|
+ Scuba|
+ in\s+Oakes|
+ approval.+provost|
+ current.+leader|
+ (?:a\s+)score\s+of\s+.+\(MPE\)|
+ (?:one|two)\s+upper-division\s+history\s+courses|
+ journalism\s+experience|
+ (?:the\s+)?equivalent|
+ essay.+member|
+ a\s+proposal.+supervise|
+ (?:determination|admission|audition).+meeting|
+ placement\s+by\s+interview|
+ proficiency\s+in\s+French|
+ participation.+ACE|
+ good\s+academic\s+standing|
+ pass.+swimming|
+ AP.+(?:higher|\d+)|
+ one.+studies|
+ enrollment\s+in|
+ is\s+required|
+ open.+Press|
+ freestyle.+breaststroke|
+ certification.+Program|
+ consent.+instructor|
+ Successful|
+ the.+Program|
+ satisfaction.+requirements|
+ one.+additional.+course|
+ required|experience|
+ must.+concurrently|
+ are.+recommended|
+ an.+department|
+ \s+any\s+|
+ of.+the.+following|
+ permission.+department|
+ Entry.+requirements|
+ successful.+core|
+ at\s+least.+cour?ses|
+ score.+\(MPE\)|
+ of|
+ score\s+of\s+\d+or\s+higher\s+on\s+the\s+mathematics\s+placement\s+examination\s+\(MPE\)|
+ (?:is\s+)?(?:are\s+)?(?:strongly\s+)?recommended(?:\s+as\s+preparation)?|
+ (?:is\s+)?[Rr]equire(?:d|ment)|
+ (?:enrollment.+|is.+restricted.+)?(?:seniors?|upper-division|graduate(?:\s+students?))(?:.+standing)?|
+ higher|requirements|university.level.+biology|as.preparation|preferred|\(|\)|previous.or.concurrent.enrollment.in|ocean|[Ee]arth|
+ intro.+tion|
+ with.+adviser|
+ highly.+this.course|
+ prior.+this.course|
+ sub.+supervise|
+ work.+enroll|
+ to.enroll|
+ sciences.is.+recommended|
+ non.sculpture.+studios.from|
+ non.print.+studios.from|
+ non.painting.+studios.from|
+ non.photography.+studios.from|
+ from:|
+ per.+permission|
+ probability.+background|
+ basic.+systems|
+ qualifications.+inquire.+office|
+ or.by.permission.+instructor|
+ familiarity.+C\+\+|
+ exceptions.+instructor|
+ computer.+elective|
+ intro.CAL.+classroom|
+ an.understanding.+program|
+ grade.+better.in|
+ are.required|
+ per.+permission|
+ exception.+instructor|
+ restricted.+majors|
+ intro.+tion|
+ restricted.+seniors|
+ psychology.+majors|
+ upper.+course|
+ as.+course|
+ a.university.level.+instructor|
+ as.prereq.+course|
+ knowledge.+language|
+ engagement.+research|
+ petition.+agency|
+ proof.+writing|
+ see.+information|
+ admission.+audition|
+ strong.+recommended|
+ application.+letter|
+ folklore.+recommended|
+ sponsoring.+approval|
+ advancement.to.candidacy|
+ instructoazr|
+ for.+majors|
+ a.+recommended|
+ at.+language.+equivalent|
+ knowledge.+language|
+ instructor|
+ petition.+agency|
+ preparation|
+ at.+following:|
+ determination.+application;|
+ a.college.level.calculus.course|
+ intro.Spanish.+Examination|
+ )|
+ (\s+)|
+ ((?:[A-Z][a-z]+(?:\s+and)?[\s/]+)*[A-Z][a-z]+|[A-Z]+)|
+ (\d+[A-Z]?(?:[/-][A-Z])*)|
+ ([;,]\s*(?:and|or)?)|
+ (and|or)|
+ ([Oo]ne|[Tt]wo)|
+ (concurrent\s+enrollment\s+in)|
+ (required)|
+ (either)|
+ (.+)
+'''), re.DOTALL | re.VERBOSE)
+
+assert(re.match(tokenizer, 'satisfaction of the Entry Level Writing and Composition requirements'))
+assert(re.match(tokenizer, 'permission of instructor'))
+assert(re.match(tokenizer, 'permission of the instructor'))
+
+
+def parse_prereqs (prereqs, dept, depts):
+ # print("Parsing '%s'"%prereqs)
+ depts['course'] = dept
+ depts['courses'] = dept
+ course_prefix = "N/A "
+ for match in re.finditer(tokenizer, prereqs):
+ (course_keyword,
+ ignore,
+ whitespace,
+ course, number,
+ delims,
+ and_or,
+ one_from,
+ concurrent,
+ required,
+ either,
+ error
+ ) = match.groups()
+ if error:
+ with open ('unparsed', 'a') as f:
+ f.write(error+'\n')
+ print("unparsed: '%s'"%error)
+ # raise Exception("unmatched token(s) '%s' in '%s'"%(error, prereqs))
+ elif course:
+ course = course.strip()
+ try:
+ course_prefix = '%s '%depts[course].upper()
+ except KeyError:
+ pass
+ # print("Unhandled course: '%s'"%course)
+ elif number:
+ pass
+ # print(course_prefix+number)
+
diff --git a/crawlers/ucsc/ucsc_registrar_crawler.py b/crawlers/ucsc/ucsc_registrar_crawler.py
new file mode 100644
index 0000000..50f6f85
--- /dev/null
+++ b/crawlers/ucsc/ucsc_registrar_crawler.py
@@ -0,0 +1,68 @@
+from bs4 import BeautifulSoup
+from urllib2 import urlopen
+from pprint import pprint
+import re
+
+def fetch_html (url, process_callback):
+ response = urlopen(url)
+ return process_callback(BeautifulSoup(response.read(), 'html.parser'))
+
+
+def enforce (condition, msg, *args):
+ if not condition:
+ raise Exception(msg % args)
+
+
+def process_registrar_page_content (url, callback):
+ def process (soup):
+ top = soup.find(id='top')
+ enforce(top, "Could not find 'top' element in page at '%s':%s",
+ url, soup.prettify())
+ content = top.parent.parent
+ enforce('content' in content['class'],
+ "Expected #top to be nested within
, not\n%s",
+ content.prettify() if content else '', soup.prettify())
+ return callback(content)
+ return fetch_html(url, process)
+
+def filterMapRegex (items, regex, groups = (1,)):
+ for item in items:
+ match = re.match(regex, item)
+ if match:
+ yield match.group(*groups)
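+
+# Usage sketch (hypothetical input; mirrors how process_registrar_course_page calls it below):
+#   list(filterMapRegex(['12. Intro to Foo', 'not a course line'], r'(\d+)\.\s+(.*)', (1, 2)))
+#   -> [('12', 'Intro to Foo')]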
+
+def process_registrar_course_page (dept):
+ dept = dept.upper()
+ prefix = dept + ' '
+ courses = {}
+ def parse_course (name, text):
+ items = text.split('.')
+ courses[name] = { 'dept': dept }
+ if len(items) > 0:
+ courses[name]['title'] = items[0]
+ items = items[1:]
+ if len(items) > 0:
+ match = re.match(r'\s*([FWS](?:,[FWS])*|\*)\s+', items[0])
+ enforce(match, "Could not match terms in '%s'", items[0])
+ courses[name]['terms'] = match.group(1).replace(',','')
+ courses[name]['instructor'] = items[-1]
+ items = items[:-1]
+ if len(items) > 0:
+ courses[name]['description'] = '.'.join(items)
+
+ def process (content):
+ text = content.text
+ text = re.sub(r'\.([\)"]+)', r'\1.', text)
+ items = filterMapRegex(text.split('\n'),
+ r'(\d+[A-Z]?)\.\s+([^\n]+)', (1, 2))
+ for courseId, rest in items:
+ parse_course(prefix + courseId, rest)
+ return courses
+ return process
+
+if __name__ == '__main__':
+ result = process_registrar_page_content(
+ 'https://registrar.ucsc.edu/catalog/archive/17-18/programs-courses/course-descriptions/math.html',
+ process_registrar_course_page('math'))
+
+ pprint(result)
diff --git a/crawlers/ucsc_old/scrapy.cfg b/crawlers/ucsc_old/scrapy.cfg
new file mode 100644
index 0000000..7bed386
--- /dev/null
+++ b/crawlers/ucsc_old/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = ucsc.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = ucsc
diff --git a/crawlers/ucsc_old/ucsc/__init__.py b/crawlers/ucsc_old/ucsc/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawlers/ucsc_old/ucsc/architecture.py b/crawlers/ucsc_old/ucsc/architecture.py
new file mode 100644
index 0000000..a5f952c
--- /dev/null
+++ b/crawlers/ucsc_old/ucsc/architecture.py
@@ -0,0 +1,200 @@
+import scrapy
+import re
+''' Crappy initial implementation. Minimum required to run / pass. Can add useful features later. '''
+
+
+class BaseCrawler:
+ pass
+
+class SelectorWrapper:
+ def __init__ (self, value):
+ self.value = value
+
+ def xpath_require_one (self, selection):
+ if self.value:
+ result = self.value.xpath(selection)
+ if result is None or len(result) > 1:
+ raise Exception("Expected single selection with '%s', got '%s', prev selection:\n%s"%(
+ selection, result, self.value.extract()))
+ return SelectorWrapper(result)
+ return self
+
+ def xpath_require_many (self, selection):
+ if self.value:
+ result = self.value.xpath(selection)
+ if result is None:
+ raise Exception("Expected 1+ selection(s) with '%s', got '%s', prev selection:\n%s"%(
+ selection, result, self.value.extract()))
+ return SelectorWrapper(result)
+ return self
+
+ def map_async (self, callback):
+ if not self.value:
+ callback(self)
+ else:
+ for entry in self.value:
+ callback(SelectorWrapper(entry))
+
+ def xpath_stripped_text (self, selection=None, strip=None):
+ if self.value:
+ selection = '%s/text()'%selection if selection else 'text()'
+
+ result = self.value.xpath(selection)
+ result = result.extract() if result else result
+ if result is None:# or len(result) != 1:
+ raise Exception("Expected text(), in selection '%s', got '%s' in:\n%s"%(
+ selection, result, self.value.extract()))
+ return SelectorWrapper(result[0].strip(strip) if strip else result[0].strip())
+ return self
+
+ def xpath_attrib (self, selection, strip=None):
+ if self.value:
+ result = self.value.xpath(selection)
+ result = result.extract() if result else result
+ if result is None or len(result) != 1:
+ raise Exception("Expected attrib '%s', got '%s' in:\n%s"%(
+ selection, result, self.value.extract()))
+ return SelectorWrapper(result[0].strip(strip) if strip else result[0].strip())
+ return self
+
+
+
+ def bind (self, result, attrib):
+ if self.value:
+ value = self.value if type(self.value) == str or type(self.value) == unicode or type(self.value) == int \
+ else self.value.extract()[0]
+ if type(attrib) == str or type(attrib) == unicode:
+ result[attrib] = value
+ elif type(attrib) == tuple:
+ for k in attrib:
+ result[k] = value
+ else:
+ raise Exception("Invalid argument passed to %s.bind(): %s %s"%(
+ type(self), type(attrib), attrib))
+ else:
+ result[attrib] = None
+ print("Failed to assign attrib '%s' to %s in %s"%(
+ attrib, type(result[attrib]), type(result)))
+
+ def equals (self, other):
+ # if (type(self.value) == str or type(self.value) == unicode) == (type(other) == str or type(other) == unicode):
+ # pass
+ # if type(self.value) != type(other):
+ # raise Exception("%s.equals() attempting to compare conflicting types: %s and %s"%(
+ # type(self), type(self.value), type(other)))
+ return self.value == other
+
+ def matches_re (self, regex):
+ if not self.value:
+ raise Exception("Attempting to do regex match on null result")
+
+ if type(self.value) == str or type(self.value) == unicode:
+ return re.match(regex, self.value) is not None
+ return self.value.re(regex) is not None
+
+ def contains (self, other):
+ if type(self.value) == str or type(self.value) == unicode:
+ return other in self.value
+ return self.value.contains(other)
+
+ def bind_re (self, regex, result, attrib):
+ if self.value:
+ try:
+ value = self.value.extract()[0]
+ except AttributeError:
+ value = self.value
+ # value = self.value if type(self.value) == str or type(self.value) == unicode or type(self.value) == int \
+ # else self.value.extract()[0]
+
+ match = re.match(regex, value)
+ if not match:
+ raise Exception("Failed to match regex '%s' against input %s"%(
+ regex, value))
+
+ if type(attrib) == str or type(attrib) == unicode:
+ result[attrib] = match.group(1)
+ elif type(attrib) == tuple:
+ for i, k in enumerate(attrib):
+ result[k] = match.group(i+1)
+ else:
+ raise Exception("Invalid argument passed to %s.bind_re(): %s %s"%(
+ type(self), type(attrib), attrib))
+ else:
+ result[attrib] = None
+ print("Failed to assign attrib '%s' to %s in %s"%(
+ attrib, type(result[attrib]), type(result)))
+
+ def bind_re_map (self, regex, result, attrib, transform):
+ if self.value:
+ value = self.value if type(self.value) == str or type(self.value) == int or type(self.value) == unicode \
+ else self.value.extract()[0]
+
+ match = re.match(regex, value)
+ if not match:
+ raise Exception("Failed to match regex '%s' against input %s"%(
+ regex, value))
+
+ if type(attrib) == str:
+ result[attrib] = transform(match.group(1))
+ elif type(attrib) == tuple:
+ for i, (k, f) in enumerate(zip(attrib, transform)):
+ result[k] = f(match.group(i+1))
+ else:
+ raise Exception("Invalid argument passed to %s.bind_re(): %s %s"%(
+ type(self), type(attrib), attrib))
+ else:
+ result[attrib] = None
+ print("Failed to assign attrib '%s' to %s in %s"%(
+ attrib, type(result[attrib]), type(result)))
+
+ def to_int (self):
+ if self.value:
+ return SelectorWrapper(int(self.value))
+ return self
+
+ def request_async_crawl (self, crawler=None, url=None):
+ assert(crawler is not None and url is not None)
+
+
+ def map_sequential_cases (self, selection=None, check='maybe', cases=None):
+ assert(check in set(('yes', 'no', 'maybe')))
+ assert(cases is not None)
+ assert(type(cases) == tuple)
+ assert(type(cases[0]) == tuple)
+ assert(type(cases[0][0]) == str)
+
+ do_check = check != 'no'
+ if not self.value:
+ for req, test, applicator in cases:
+ applicator(self)
+ else:
+ results = self.value.xpath(selection) if selection else self.value
+ i = 0
+ for item in results:
+ result = SelectorWrapper(item)
+ if i >= len(cases):
+ print("More items than cases; ignoring extra items")
+ return
+ if do_check and not cases[i][1](result):
+ if cases[i][0] == 'required':
+ raise Exception("Failed map_sequential_cases case test (%d):\n%s"%(
+ i, result))
+ else:
+ cases[i][2](result)
+ i += 1
+ if i < len(cases):
+ print("Did not visit all items")
+
+
+def item_producer (Item):
+ def decorator (fcn):
+ def wrapper (self, request):
+ result = Item()
+ fcn(self, request, result)
+ return wrapper
+ return decorator
+
+def parser_entrypoint (fcn):
+ def wrapper (self, request):
+ return fcn(self, SelectorWrapper(request))
+ return wrapper
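+
+# Intended usage of these decorators (mirrors PisaCourseIndexCrawler in
+# spiders/pisa_index_crawler.py; the class and item names below are illustrative):
+#
+#   class MyCrawler (BaseCrawler):
+#       @parser_entrypoint
+#       def parse (self, response):          # response arrives wrapped in a SelectorWrapper
+#           response.xpath_require_many('//div').map_async(self.parse_item)
+#
+#       @item_producer(SomeItem)
+#       def parse_item (self, selector, result):   # result is a fresh SomeItem()
+#           selector.xpath_attrib('@href').bind(result, 'url')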
diff --git a/crawlers/ucsc_old/ucsc/items.py b/crawlers/ucsc_old/ucsc/items.py
new file mode 100644
index 0000000..f06eff9
--- /dev/null
+++ b/crawlers/ucsc_old/ucsc/items.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+from scrapy.item import Item, Field
+
+class ProgramStatementItem(scrapy.Item): # not very important
+ url = Field()
+ title = Field()
+ program_statement = Field()
+ raw_page_content = Field()
+
+class CourseDescriptionItem(scrapy.Item): # VERY IMPORTANT
+ url = Field()
+ dept = Field()
+ dept_title = Field()
+ course_number = Field()
+ course_title = Field()
+ quarters_offered = Field()
+ course_description = Field()
+
+class FacultyItem(scrapy.Item): # would be nice to have
+ url = Field()
+ name = Field() # before ',' (recommend .split(',')[0])
+ title = Field() # everything after ',' (recommend .split(',')[1:])
+ statement = Field() # optional
+
+
+class PisaIndexItem(scrapy.Item):
+ """ Encapsulates all the data visible from a Pisa course listing on pisa.ucsc.edu/class_search/index.php """
+ url = Field() # url of class page, eg. "https://pisa.ucsc.edu/class_search/index.php/index.php?action=detail&class_data=YToyOntzOjU6IjpTVFJNIjtzOjQ6IjIxODgiO3M6MTA6IjpDTEFTU19OQlIiO3M6NToiMjE3MjMiO30%3D"
+ course_name = Field() # string, eg. "AMS 03"
+ course_title = Field() # string, eg. "Precalculus"
+ course_section = Field() # string, eg. "01"
+ class_number = Field() # int, eg. 21723
+ instructor = Field() # string, eg. "Garaud,P."
+ class_type = Field() # "LEC", "LAB", or "SEM" (or "DISC"...?)
+ location = Field() # string, eg. "Soc Sci 2 075"
+ meet_times = Field() # string, eg. "MWF 10:40AM-11:45AM"
+ enroll_max = Field() # int
+ enroll_current = Field() # int
+ materials_url = Field() # link to materials page, eg. "http://ucsc.verbacompare.com/comparison?id=FL18__AMS__003__01"
+ term = Field() # TBD, eg. "Fall 2018"
+ term_id = Field() # TBD, integer id used when searching via form
+
+
+class PisaCourseItem(scrapy.Item):
+ """ Encapsulates all the data visible from a class page; TBD """
+ url = Field() # url of class page, eg. "https://pisa.ucsc.edu/class_search/index.php/index.php?action=detail&class_data=YToyOntzOjU6IjpTVFJNIjtzOjQ6IjIxODgiO3M6MTA6IjpDTEFTU19OQlIiO3M6NToiMjE3MjMiO30%3D"
+ course_name = Field() # string, eg. "AMS 03"
+ course_title = Field() # string, eg. "Precalculus"
+ course_section = Field() # string, eg. "01"
+ class_number = Field() # int, eg. 21723
+ lecture_number = Field() # int, class_number of lecture component (or class_number)
+ instructor = Field() # string, eg. "Garaud,P."
+ class_type = Field() # "LEC", "LAB", or "SEM" (or "DISC"...?)
+ class_type_pretty = Field() # "Lecture", ...
+ location = Field() # string, eg. "Soc Sci 2 075"
+ meet_times = Field() # string, eg. "MWF 10:40AM-11:45AM"
+ enroll_max = Field() # int
+ enroll_current = Field() # int
+ materials_url = Field() # link to materials page, eg. "http://ucsc.verbacompare.com/comparison?id=FL18__AMS__003__01"
+ term = Field() # eg. "Fall 2018"
+ term_id = Field() # integer id used when searching via form
+ career_type = Field()
+ grading_options = Field()
+ credits = Field()
+ gen_ed_categories = Field()
+ waitlist_max = Field()
+ waitlist_current = Field()
+
+ course_description = Field() # Description text
+ enrollment_reqs = Field() # Enrollment text
+ class_notes = Field() # Class notes text
+ class_dates = Field()
diff --git a/crawlers/ucsc_old/ucsc/middlewares.py b/crawlers/ucsc_old/ucsc/middlewares.py
new file mode 100644
index 0000000..8697293
--- /dev/null
+++ b/crawlers/ucsc_old/ucsc/middlewares.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class UcscSpiderMiddleware(object):
+ # Not all methods need to be defined. If a method is not defined,
+ # scrapy acts as if the spider middleware does not modify the
+ # passed objects.
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ # This method is used by Scrapy to create your spiders.
+ s = cls()
+ crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+ return s
+
+ def process_spider_input(self, response, spider):
+ # Called for each response that goes through the spider
+ # middleware and into the spider.
+
+ # Should return None or raise an exception.
+ return None
+
+ def process_spider_output(self, response, result, spider):
+ # Called with the results returned from the Spider, after
+ # it has processed the response.
+
+ # Must return an iterable of Request, dict or Item objects.
+ for i in result:
+ yield i
+
+ def process_spider_exception(self, response, exception, spider):
+ # Called when a spider or process_spider_input() method
+ # (from other spider middleware) raises an exception.
+
+ # Should return either None or an iterable of Response, dict
+ # or Item objects.
+ pass
+
+ def process_start_requests(self, start_requests, spider):
+ # Called with the start requests of the spider, and works
+ # similarly to the process_spider_output() method, except
+ # that it doesn’t have a response associated.
+
+ # Must return only requests (not items).
+ for r in start_requests:
+ yield r
+
+ def spider_opened(self, spider):
+ spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class UcscDownloaderMiddleware(object):
+ # Not all methods need to be defined. If a method is not defined,
+ # scrapy acts as if the downloader middleware does not modify the
+ # passed objects.
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ # This method is used by Scrapy to create your spiders.
+ s = cls()
+ crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+ return s
+
+ def process_request(self, request, spider):
+ # Called for each request that goes through the downloader
+ # middleware.
+
+ # Must either:
+ # - return None: continue processing this request
+ # - or return a Response object
+ # - or return a Request object
+ # - or raise IgnoreRequest: process_exception() methods of
+ # installed downloader middleware will be called
+ return None
+
+ def process_response(self, request, response, spider):
+ # Called with the response returned from the downloader.
+
+ # Must either;
+ # - return a Response object
+ # - return a Request object
+ # - or raise IgnoreRequest
+ return response
+
+ def process_exception(self, request, exception, spider):
+ # Called when a download handler or a process_request()
+ # (from other downloader middleware) raises an exception.
+
+ # Must either:
+ # - return None: continue processing this exception
+ # - return a Response object: stops process_exception() chain
+ # - return a Request object: stops process_exception() chain
+ pass
+
+ def spider_opened(self, spider):
+ spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/crawlers/ucsc_old/ucsc/pipelines.py b/crawlers/ucsc_old/ucsc/pipelines.py
new file mode 100644
index 0000000..2208d67
--- /dev/null
+++ b/crawlers/ucsc_old/ucsc/pipelines.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class UcscPipeline(object):
+ def process_item(self, item, spider):
+ return item
diff --git a/crawlers/ucsc_old/ucsc/settings.py b/crawlers/ucsc_old/ucsc/settings.py
new file mode 100644
index 0000000..0a18688
--- /dev/null
+++ b/crawlers/ucsc_old/ucsc/settings.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for ucsc project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+# https://doc.scrapy.org/en/latest/topics/settings.html
+# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'ucsc'
+
+SPIDER_MODULES = ['ucsc.spiders']
+NEWSPIDER_MODULE = 'ucsc.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'ucsc (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+# 'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+# 'ucsc.middlewares.UcscSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+# 'ucsc.middlewares.UcscDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+# 'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+# 'ucsc.pipelines.UcscPipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+AUTOTHROTTLE_DEBUG = False
+#CONCURRENT_REQUESTS_PER_IP
+CONCURRENT_REQUESTS_PER_IP = 5
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/crawlers/ucsc_old/ucsc/spiders/__init__.py b/crawlers/ucsc_old/ucsc/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/crawlers/ucsc_old/ucsc/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/crawlers/ucsc_old/ucsc/spiders/pisa.py b/crawlers/ucsc_old/ucsc/spiders/pisa.py
new file mode 100644
index 0000000..714feec
--- /dev/null
+++ b/crawlers/ucsc_old/ucsc/spiders/pisa.py
@@ -0,0 +1,187 @@
+# -*- coding: utf-8 -*-
+import scrapy
+import logging
+import re
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import CrawlSpider, Rule
+from ucsc.items import PisaIndexItem, PisaCourseItem
+
+site_path = lambda path: '{}/{}'.format(
+ 'https://pisa.ucsc.edu/class_search', path)
+
+def parse_course_title (text, result):
+ assert(text)
+ match = re.match(r'\s*(\w+\s+\d+[A-Z]?)[^\d]+(\d+)[^\w]+([^\n]+)', text)
+ if not match:
+ raise Exception("Failed to parse '%s'"%text)
+ result['course_name'] = match.group(1)
+ result['course_section'] = match.group(2)
+ result['course_title'] = match.group(3).strip()
+
+class PisaSpider(scrapy.Spider):
+ name = 'pisa'
+ allowed_domains = ['pisa.ucsc.edu']
+ search_url = site_path('index.php')
+ start_urls = [ search_url ]
+
+ def __init__(self, *args, **kwargs):
+ logger = logging.getLogger('scrapy.spidermiddlewares.httperror')
+ logger.setLevel(logging.WARNING)
+ super(PisaSpider, self).__init__(*args, **kwargs)
+
+ self.max_index_scrapes = -1
+ self.max_page_scrapes = -1
+ self.pages_total = 0
+ self.pages_done = 0
+
+ def parse(self, response):
+ yield scrapy.FormRequest(url=self.search_url,
+ formdata={'action':'results',
+ 'binds[:term]':'2188',
+ 'binds[:reg_status]':'all',
+ 'binds[:subject]':'',
+ 'binds[:catalog_nbr_op]':'=''',
+ 'binds[:catalog_nbr]':'',
+ 'binds[:title]':'',
+ 'binds[:instr_name_op]':'=''',
+ 'binds[:instructor]':'',
+ 'binds[:ge]':'',
+ 'binds[:crse_units_op]':'=''',
+ 'binds[:crse_units_from]':'',
+ 'binds[:crse_units_to]':'',
+ 'binds[:crse_units_exact]':'',
+ 'binds[:days]':'',
+ 'binds[:times]':'',
+ 'binds[:acad_career]':'',
+ 'binds[:session_code]':'',
+ 'rec_start': '0',
+ 'rec_dur': '1582'},
+ callback=self.parse_course_index)
+
+ def parse_course_index(self, response):
+ if self.max_index_scrapes == 0:
+ return
+
+ print("Parsing index '%s'"%response.url)
+ items = response.xpath('body/div[contains(@class,"center-block")]/div[@class="panel-body"]/div[contains(@id,"rowpanel")]')
+ assert(items)
+ for item in items:
+ if self.max_index_scrapes == 0:
+ return
+ self.max_index_scrapes -= 1
+
+ result = PisaIndexItem()
+ anchor = item.xpath('div[contains(@class,"panel-heading")]/h2/a[contains(@id,"class_id_")]')
+ assert(anchor)
+ result['url'] = site_path(anchor.xpath('@href').extract()[0])
+
+ # Temporarily disabled; this IS valid index data
+ #
+ # parse course name, title, section
+ # parse_course_title(anchor.xpath('text()').extract()[0], result)
+
+ # grab class number + enrollment info
+ # rest = item.xpath('div[contains(@class,"panel-body")]/div[contains(@class,"row")]')
+ # assert(rest)
+ # result['class_number'] = int(rest.xpath('div[1]/a/text()').extract()[0])
+ # result['instructor'] = rest.xpath('div[2]/text()').extract()[0].strip()
+ # location_info = rest.xpath('div[3]/text()')
+ # result['class_type'], result['location'] = location_info.re(r'\s*([A-Z]+):\s+([\s\w]+)')
+
+ # result['meet_times'] = rest.xpath('div[4]/text()').extract()[0].strip()
+ # enroll_info = rest.xpath('div[5]/text()')
+ # result['enroll_current'], result['enroll_max'] = map(int, enroll_info.re(r'\s*(\d+)\s+of\s+(\d+)'))
+ # result['materials_url'] = rest.xpath('div[6]/a/@href').extract()[0]
+
+ # yield result
+ # print("Sending crawl request for '%s'"%result['url'])
+ if self.max_page_scrapes != 0:
+ self.max_page_scrapes -= 1
+ yield scrapy.Request(result['url'], callback=self.parse_course_page)
+ self.pages_total += 1
+ print("%d / %d"%(self.pages_done, self.pages_total))
+
+
+ def parse_course_page(self, response):
+ result = PisaCourseItem()
+ content = response.xpath('body/div[contains(@class,"panel")]/div[contains(@class,"panel-body")]')
+ assert(content)
+
+ parse_course_title(content.xpath('div[1]/div/h2/text()').extract()[0], result)
+ result['term'] = content.xpath('div[2]/div/text()').extract()[0].strip()
+
+ def parse_panel_class_details (panel_body):
+ details = panel_body.xpath('div[contains(@class,"row")]')
+ left_panel, right_panel = details.xpath('div[1]/dl'), details.xpath('div[2]/dl')
+ result['career_type'] = left_panel.xpath('dd[1]/text()').extract()[0].strip('"')
+ result['grading_options'] = left_panel.xpath('dd[2]/text()').extract()[0].strip('"')
+ result['class_number'] = int(left_panel.xpath('dd[3]/text()').extract()[0].strip('"'))
+ result['lecture_number'] = result['class_number']
+ class_type = left_panel.xpath('dd[4]/text()').extract()[0].strip('"')
+ try:
+ result['class_type'] = {
+ 'Lecture': 'LEC',
+ 'Discussion': 'DISC',
+ 'Seminar': 'SEM',
+ 'Laboratory': 'LAB',
+ 'Field Studies': 'FLD',
+ 'Studio': 'fixme (Studio)',
+ }[class_type]
+ except KeyError:
+ print("FIXME unhandled class type: '%s'"%class_type)
+ # raise Exception("Unhandled class_type: '%s'"%class_type)
+ result['credits'] = left_panel.xpath('dd[5]/text()').extract()[0].strip('"')
+ result['gen_ed_categories'] = left_panel.xpath('dd[5]/text()').extract()[0].strip('"')
+ avail_seats = int(right_panel.xpath('dd[2]/text()').extract()[0].strip('"'))
+ result['enroll_max'] = int(right_panel.xpath('dd[3]/text()').extract()[0].strip('"'))
+ result['enroll_current'] = int(right_panel.xpath('dd[4]/text()').extract()[0].strip('"'))
+ result['waitlist_max'] = int(right_panel.xpath('dd[5]/text()').extract()[0].strip('"'))
+ result['waitlist_current'] = int(right_panel.xpath('dd[6]/text()').extract()[0].strip('"'))
+ # assert(avail_seats == result['enroll_max'] - result['enroll_current'])
+
+ def parse_panel_description (panel_body):
+ result['course_description'] = panel_body.xpath('text()').extract()[0].strip()
+
+ def parse_panel_enrollment_reqs (panel_body):
+ result['enrollment_reqs'] = panel_body.xpath('text()').extract()[0].strip()
+
+ def parse_panel_class_notes (panel_body):
+ result['class_notes'] = panel_body.xpath('text()').extract()[0].strip()
+
+ def parse_panel_meeting_info (panel_body):
+ meet_info = panel_body.xpath('table')
+ meet_info = meet_info.xpath('tbody') or meet_info  # fall back to the table itself if there is no tbody
+ meet_info = meet_info.xpath('tr[2]')
+ # print(meet_info.extract())
+ if meet_info:
+ result['meet_times'] = meet_info.xpath('td[1]/text()').extract()[0].strip()
+ result['location'] = meet_info.xpath('td[2]/text()').extract()[0].strip()
+ result['instructor'] = meet_info.xpath('td[3]/text()').extract()[0].strip()
+ result['class_dates'] = meet_info.xpath('td[4]/text()').extract()[0].strip()
+
+ def parse_panel_sections (panel_body):
+ pass
+
+ def parse_panel_combined_sections (panel_body):
+ pass
+
+ panels = content.xpath('div[contains(@class,"panel-group")]/div[contains(@class,"row")]')
+ for panel in panels:
+ header = panel.xpath('div[contains(@class,"panel-heading")]/h2/text()').extract()[0].strip()
+ body = panel.xpath('div[contains(@class,"panel-body")]')
+ try:
+ {
+ 'Class Details': parse_panel_class_details,
+ 'Description': parse_panel_description,
+ 'Enrollment Requirements': parse_panel_enrollment_reqs,
+ 'Class Notes': parse_panel_class_notes,
+ 'Meeting Information': parse_panel_meeting_info,
+ 'Combined Sections': parse_panel_combined_sections,
+ 'Associated Discussion Sections or Labs': parse_panel_sections,
+ }[header](body)
+ except KeyError:
+ raise Exception("Unhandled panel: '%s', with content:\n%s"%(header, body.extract()))
+
+ yield result
+ self.pages_done += 1
+ print("%d / %d"%(self.pages_done, self.pages_total))
diff --git a/crawlers/ucsc_old/ucsc/spiders/pisa_index_crawler.py b/crawlers/ucsc_old/ucsc/spiders/pisa_index_crawler.py
new file mode 100644
index 0000000..27b3146
--- /dev/null
+++ b/crawlers/ucsc_old/ucsc/spiders/pisa_index_crawler.py
@@ -0,0 +1,150 @@
+'''
+ This is a test for an API wrapper around scrapy.
+ It passes when the following code runs and produces complete, correct output matching the current crawler (pisa.py).
+'''
+from ucsc.architecture import BaseCrawler, item_producer, parser_entrypoint
+from ucsc.items import PisaIndexItem
+import scrapy
+from scrapy.spiders import CrawlSpider
+import datetime
+import re
+
+site_path = lambda path: '{}/{}'.format(
+ 'https://pisa.ucsc.edu/class_search', path)
+
+def to_datetime_time (hourly_time_string):
+ ''' Converts a time string generated by pisa (e.g. "10:40AM") into a python datetime.time value '''
+ hours, minutes, am_pm = re.match(r'(\d+):(\d+)(AM|PM)', hourly_time_string).groups()
+ hours, minutes = int(hours), int(minutes)
+ if am_pm == 'PM' and hours != 12: # 12:xxPM is already afternoon
+ hours += 12
+ elif am_pm == 'AM' and hours == 12: # 12:xxAM is midnight
+ hours = 0
+ return datetime.time(hours, minutes)
+
+# TODO: unittests ^
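+# A minimal sketch of those unit tests (the sample strings follow the "10:40AM-11:45AM"
+# format noted in items.py; they are not taken from real pisa output):
+assert(to_datetime_time('10:40AM') == datetime.time(10, 40))
+assert(to_datetime_time('1:05PM') == datetime.time(13, 5))
+assert(to_datetime_time('12:00PM') == datetime.time(12, 0))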
+
+class PisaCourseIndexCrawler (BaseCrawler):
+ @parser_entrypoint
+ def parse (self, response):
+ response.xpath_require_one('body') \
+ .xpath_require_one('div[contains(@class,"center-block")]') \
+ .xpath_require_one('div[@class="panel-body"]') \
+ .xpath_require_many('div[contains(@id,"rowpanel")]') \
+ .map_async(self.parse_index_item)
+
+ @item_producer(PisaIndexItem)
+ def parse_index_item (self, response, result):
+ anchor = response \
+ .xpath_require_one('div[contains(@class,"panel-heading")]') \
+ .xpath_require_one('h2/a[contains(@id,"class_id_")]')
+
+ anchor.xpath_attrib('@href').bind(result, 'url')
+ anchor.xpath_stripped_text().bind_re(
+ r'\s*(\w+\s+\d+[A-Z]?)[^\d]+(\d+)[^\w]+([^\n]+)',
+ result,
+ ('course_name', 'course_section', 'course_title'))
+
+ content = response \
+ .xpath_require_one('div[contains(@class,"panel-body")]') \
+ .xpath_require_one('div[contains(@class,"row")]')
+
+ content.xpath_require_many('div[@class="col-xs-6 col-sm-3"]') \
+ .map_sequential_cases(check='maybe', cases=(
+ ('required',
+ lambda test:
+ test.xpath_stripped_text().equals("Class Number:") and \
+ test.xpath_attrib('a/@id').matches_re(r'class_nbr_\d+') and \
+ test.xpath_attrib('a/@href').matches_re(
+ r'https://pisa\.ucsc\.edu/class_search/index\.php\?action=detail&class_data=\w+'),
+ lambda value: value.xpath_stripped_text('a').to_int().bind(result, 'class_number')),
+
+ ('required',
+ lambda test:
+ test.xpath_require_one('i[1]').xpath_attrib('@class').contains('fa-user') and \
+ test.xpath_require_one('i[2]').xpath_attrib('@class').equals('sr-only') and \
+ test.xpath_require_one('i[2]').xpath_stripped_text().equals('Instructor:'),
+ lambda value: value.xpath_stripped_text().bind(result, 'instructor')),
+
+ ('required',
+ lambda test:
+ test.xpath_require_one('i[1]').xpath_attrib('@class').contains('fa-location-arrow') and \
+ test.xpath_require_one('i[2]').xpath_attrib('@class').equals('sr-only') and \
+ test.xpath_require_one('i[2]').xpath_stripped_text().equals('Day and Time:'),
+ lambda value: value.xpath_stripped_text().bind_re(
+ r'(LEC|DISC|LAB):\s+([\w\s]+)',
+ result,
+ ('class_type', 'location'))),
+
+ ('required',
+ lambda test:
+ test.xpath_require_one('i[1]').xpath_attrib('@class').contains('fa-clock-o') and \
+ test.xpath_require_one('i[2]').xpath_attrib('@class').equals('sr-only') and \
+ test.xpath_require_one('i[2]').xpath_stripped_text().equals('Location:'),
+ lambda value: value.xpath_stripped_text().bind_re_map(
+ r'((?:M|Tu|W|Tr|F)+)\s+(\d+:\d+(?:AM|PM))-(\d+:\d+(?:AM|PM))',
+ result,
+ ('meet_days', 'meet_begin', 'meet_end'),
+ (lambda days: days.replace('Tr','R').replace('Tu','T'), to_datetime_time, to_datetime_time))),
+
+ ('required',
+ lambda test: True,
+ # test.xpath_stripped_text().matches_re(r'\d+\s+of\d+\s+Enrolled'),
+ lambda value: value.xpath_stripped_text().bind_re_map(
+ r'(\d+)\s+of\s+(\d+)\s+Enrolled',
+ result,
+ ('enroll_current', 'enroll_max'),
+ (int, int)))
+ ))
+
+ # Simpler, but less robust version:
+ content.xpath_stripped_text('div[1]/a').to_int().bind(result, 'class_number')
+ content.xpath_stripped_text('div[2]').bind(result, 'instructor')
+ content.xpath_stripped_text('div[3]').bind(result, 'location')
+ content.xpath_stripped_text('div[4]').bind(result, 'meet_times')
+ content.xpath_stripped_text('div[5]').bind_re_map(
+ r'\s*(\d+)\s+of\s+(\d+)',
+ result,
+ ('enroll_current','enroll_max'),
+ (int,int)
+ )
+ content.xpath_attrib('div[6]/a/@href').bind(result, 'materials_url')
+ response.request_async_crawl(
+ crawler=PisaCoursePageCrawler,
+ url=result['url'])
+
+
+class PisaCoursePageCrawler (BaseCrawler):
+ def parse (self, request):
+ pass
+
+
+
+class pisa_index_crawler (scrapy.Spider):
+ name = 'pisa_index_crawler'
+ allowed_domains = ['pisa.ucsc.edu']
+ search_url = site_path('index.php')
+ start_urls = [ search_url ]
+
+ def __init__ (self, *args, **kwargs):
+ super(pisa_index_crawler, self).__init__(*args, **kwargs)
+ self.my_crawler = PisaCourseIndexCrawler()
+
+ def parse(self, response):
+ yield scrapy.FormRequest(url=self.search_url,
+ formdata={'action':'results',
+ 'binds[:term]':'2188',
+ 'binds[:reg_status]':'all',
+ 'binds[:subject]':'',
+ 'binds[:catalog_nbr_op]':'=''',
+ 'binds[:catalog_nbr]':'',
+ 'binds[:title]':'',
+ 'binds[:instr_name_op]':'=''',
+ 'binds[:instructor]':'',
+ 'binds[:ge]':'',
+ 'binds[:crse_units_op]':'=''',
+ 'binds[:crse_units_from]':'',
+ 'binds[:crse_units_to]':'',
+ 'binds[:crse_units_exact]':'',
+ 'binds[:days]':'',
+ 'binds[:times]':'',
+ 'binds[:acad_career]':'',
+ 'binds[:session_code]':'',
+ 'rec_start': '0',
+ 'rec_dur': '1582'},
+ callback=self.my_crawler.parse)
diff --git a/crawlers/ucsc_old/ucsc/spiders/registrar_courses.py b/crawlers/ucsc_old/ucsc/spiders/registrar_courses.py
new file mode 100644
index 0000000..bd80d00
--- /dev/null
+++ b/crawlers/ucsc_old/ucsc/spiders/registrar_courses.py
@@ -0,0 +1,162 @@
+# -*- coding: utf-8 -*-
+import scrapy
+import os
+from ucsc.items import FacultyItem, ProgramStatementItem, CourseDescriptionItem
+
+
+def path_components (path):
+ if '://' in path:
+ path = path.split('://')[1]
+ parts = path.split('/')
+ while parts and parts[0] == '':
+ parts = parts[1:]
+ while parts and parts[-1] == '':
+ parts = parts[:-1]
+ return parts
+
+assert(path_components('') == [])
+assert(path_components('/') == [])
+assert(path_components('foo/') == ['foo'])
+assert(path_components('/bar') == ['bar'])
+assert(path_components('foo/bar') == ['foo','bar'])
+
+def merge_url (url, rel):
+ # note: blame seiji for all the issues with this code
+ thing = url.split('://')[0] if '://' in url else 'https'
+ if url and url[-1] == '/':
+ url = path_components(url)
+ else:
+ url = path_components(url)[:-1]
+
+ for part in path_components(rel):
+ if part == '..':
+ url = url[:-1]
+ else:
+ url.append(part)
+ return thing + '://' + '/'.join(url)
+
+assert(merge_url('https://registrar.ucsc.edu/catalog/programs-courses/index.html',
+ '../foo/bar/../baz.html') == 'https://registrar.ucsc.edu/catalog/foo/baz.html')
+assert(merge_url('', 'bar.baz') == 'https://bar.baz')
+assert(merge_url('https://foo/bar/baz.html', '') == 'https://foo/bar')
+
+registrar_base_url = 'https://registrar.ucsc.edu/catalog/programs-courses'
+base_course_description_url = 'https://registrar.ucsc.edu/catalog/programs-courses/course-descriptions'
+base_faculty_url = 'https://registrar.ucsc.edu/catalog/programs-courses/faculty'
+base_program_description_url = 'https://registrar.ucsc.edu/catalog/programs-courses/program-statements'
+
+class RegistrarCoursesSpider(scrapy.Spider):
+ name = 'registrar_courses'
+ allowed_domains = ['registrar.ucsc.edu']
+ start_urls = [merge_url(registrar_base_url, 'index.html')]
+
+ def __init__(self, *args, **kwargs):
+ super(RegistrarCoursesSpider, self).__init__(*args, **kwargs)
+ self.crawled = set()
+
+ def parse (self, response):
+ print("Parsing %s"%response.url)
+
+ if base_course_description_url in response.url:
+ yield self.parse_course_info(response)
+ elif base_faculty_url in response.url:
+ yield self.parse_faculty_info(response)
+ elif base_program_description_url in response.url:
+ yield self.parse_program_info(response)
+
+ all_links = response.xpath('//a')
+ for link in all_links:
+ #print("Got link: %s"%link.extract())
+ try:
+ href = link.xpath('@href').extract()[0]
+
+ def is_local_url (url):
+ for thing in ('http:','https:','C:','www','ucsc.edu'):
+ if thing in url:
+ return False
+ return True
+
+ url = merge_url(response.url, href) if is_local_url(href) else href
+ if url in self.crawled:
+ continue
+ #print("Got URL: %s"%url)
+ self.crawled.add(url)
+ if registrar_base_url in url:
+ yield { 'url': url }
+ yield scrapy.Request(url, self.parse)
+ else:
+ pass
+ #print("Skipping %s"%url)
+ except IndexError:
+ pass
+
+ def parse_course_info (self, response):
+ info = CourseDescriptionItem()
+ info['url'] = response.url
+ print("Got %s"%response.url)
+ return info
+
+ def parse_faculty_info (self, response):
+ info = FacultyItem()
+ info['url'] = response.url
+ print("Got %s"%response.url)
+ return info
+
+ def parse_program_info (self, response):
+ info = ProgramStatementItem()
+ info['url'] = response.url
+ print("Got %s"%response.url)
+ return info
+
+
+
+class Unused:
+ def parse(self, response):
+ # Get links to all course pages from the registrar
+ page_content = response\
+ .xpath('body/div[@id="wrap"]/div[@id="container"]/div[@id="content"]')\
+ .xpath('div[@id="sprflt"]/div[@id="main"]/div[contains(@class,"content")]')
+ panel_elems = page_content.xpath('table/tbody/tr/td')
+
+ self.depts = {}
+ self.crawled = set()
+ for panel in panel_elems:
+ program_statements = panel.xpath('p/a')
+ for a in program_statements:
+ # print(a.xpath('@href').extract())
+ dept = a.xpath('@href').re(r'program-statements/(\w+)\.html')[0]
+ title = a.xpath('text()').extract()[0]
+ url = 'https://registrar.ucsc.edu/catalog/programs-courses/program-statements/%s.html'%dept
+ self.depts[dept] = title
+ self.crawled.add(url)
+ yield scrapy.Request(url, callback=self.parse_program_info)
+ #course_url = 'https://registrar.ucsc.edu/catalog/programs-courses/course-descriptions/%s.html'%dept
+ program_url = 'https://registrar.ucsc.edu/catalog/programs-courses/program-statements/%s.html'%dept
+ faculty_url = 'https://registrar.ucsc.edu/catalog/programs-courses/faculty/%s.html'%dept
+ #yield scrapy.Request(course_url, callback=self.parse_course_info)
+ yield scrapy.Request(program_url, callback=self.parse_program_info)
+ yield scrapy.Request(faculty_url, callback=self.parse_faculty_info)
+
+ def parse_program_info (self, response):
+ page_content = response\
+ .xpath('body/div[@id="wrap"]/div[@id="container"]/div[@id="content"]')\
+ .xpath('div[@id="sprflt"]/div[@id="main"]/div[contains(@class,"content")]')
+
+ page_links = page_content.xpath('p[3]/a')
+ for a in page_links:
+ href, regex = a.xpath('@href'), r'\.\./([\w\-]+/\w+\.html)'
+ try:
+ page = href.re(regex)[0]
+ title = a.xpath('text()').extract()[0]
+ url = 'https://registrar.ucsc.edu/catalog/programs-courses/program-statements/%s'%page
+ print("\n%s: %s"%(url, title))
+ except IndexError:
+ print("Could not match '%s' with '%s'"%(href, regex))
+ content = page_content
+ #print("%s"%content.extract()[0])
+
+ def parse_course_info (self, response):
+ print("Got %s"%response.url)
+
+ def parse_faculty_info (self, response):
+ print("Got %s"%response.url)
diff --git a/crawlers/ucsc_old/ucsc/spiders/rmp_ucsc.py b/crawlers/ucsc_old/ucsc/spiders/rmp_ucsc.py
new file mode 100644
index 0000000..9268646
--- /dev/null
+++ b/crawlers/ucsc_old/ucsc/spiders/rmp_ucsc.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+import scrapy
+
+
+class RmpUcscSpider(scrapy.Spider):
+ name = 'rmp-ucsc'
+ allowed_domains = ['http://www.ratemyprofessors.com/search.jsp?queryBy=schoolId&schoolName=University+of+California+Santa+Cruz&schoolID=1078&queryoption=TEACHER']
+ start_urls = ['http://www.ratemyprofessors.com/search.jsp?queryBy=schoolId&schoolName=University+of+California+Santa+Cruz&schoolID=1078&queryoption=TEACHER']
+
+ def parse(self, response):
+ pass
diff --git a/crawlers/ucsc_old/ucsc/spiders/ucsc_registrar.py b/crawlers/ucsc_old/ucsc/spiders/ucsc_registrar.py
new file mode 100644
index 0000000..4abae9e
--- /dev/null
+++ b/crawlers/ucsc_old/ucsc/spiders/ucsc_registrar.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+import scrapy
+
+
+class UcscRegistrarSpider(scrapy.Spider):
+ name = 'ucsc-registrar'
+ allowed_domains = ['https://registrar.ucsc.edu/catalog/programs-courses/']
+ start_urls = ['https://registrar.ucsc.edu/catalog/programs-courses/']
+
+ def parse(self, response):
+ pass
diff --git a/crawlers/ucsd/ucsd_crawler.py b/crawlers/ucsd/ucsd_crawler.py
new file mode 100644
index 0000000..16b98fc
--- /dev/null
+++ b/crawlers/ucsd/ucsd_crawler.py
@@ -0,0 +1,298 @@
+from bs4 import BeautifulSoup
+from urllib2 import urlopen
+from pprint import pprint
+import re
+
+def fetch_html (url, process_callback):
+ response = urlopen(url)
+ return process_callback(BeautifulSoup(response.read(), 'html.parser'))
+
+def enforce (condition, msg, *args):
+ if not condition:
+ raise Exception(msg % args)
+
+def get_catalog_course_pages (base_url):
+ index_url = '%s/front/courses.html'%base_url
+ def process (soup):
+ courses = {}
+ href_regex = re.compile(r'\.\./(courses/([^\.]+)\.html)')
+ for a in soup.find_all('a'):
+ if 'href' in a.attrs and 'title' in a.attrs:
+ match = re.match(href_regex, a.attrs['href'])
+ if match:
+ url, dept = match.group(1, 2)
+ url = '%s/%s'%(base_url, url)
+ title = a.attrs['title']
+ courses[dept] = { 'url': url, 'title': title }
+ return courses
+ return fetch_html(index_url, process)
+
+dept_set = set()
+
+def get_page_courses (dept, item, output):
+ dept_lower = dept.lower()
+ course_regex = re.compile(r'([a-z]+)(\d+[a-z]?)')
+
+ def getSiblingTextUntilNextAnchor (a):
+ text = ''
+ x = a.next
+ while x and x.name != 'a':
+ try:
+ text += x
+ except TypeError:
+ pass
+ x = x.next
+ return text
+
+ def process_course (name, title, descr):
+ if not name or len(name) == 0:
+ enforce(not title and not descr, "Empty name '%s' for '%s', '%s'", name, title, descr)
+ return None
+ hits = descr.split("Prerequisites:")
+ prereqs = ". ".join(hits[1:]).strip().strip('.')
+ descr = hits[0]
+ prereq_requirements = set()
+ def requirement (*reqs):
+ def sub (stuff):
+ # for req in reqs:
+ # prereq_requirements.add(req)
+ return ''
+ return sub
+
+ def course_case_multiple_and (match):
+ print("AND case:")
+ print(match.group(1, 2, 3))
+
+ def course_case_single (match):
+ print("SINGLE CASE: '%s' '%s'"%match.group(1, 2))
+
+ def course_case_concatenative_or (match):
+ print("OR CONCATENATIVE CASE: '%s' '%s'"%match.group(1, 2))
+
+ def course_case_concatenative_and (match):
+ print("AND CONCATENATIVE CASE: '%s' '%s'"%match.group(1, 2))
+
+ def parse_annoying_edge_case (match):
+ dept, course, suffixes = match.group(1, 2, 3)
+ match = re.match(r'(\d+)([A-Z\-]+)', course)
+ enforce(match, "Course invalid - something broke...? dept = '%s', course = '%s', suffixes = '%s'",
+ dept, course, suffixes)
+ prefix, suffix = match.group(1, 2)
+ suffixes = suffixes.strip().split()
+ print("PARSED ANNOYING EDGE CASE: dept, prefix = '%s', '%s'; suffixes = '%s', %s"%(
+ dept, prefix, suffix, suffixes))
+
+ def parse_fucking_ridiculous_everything_case (match):
+ # print("GOT RIDICULOUS CASE: '%s' '%s'"%(match.group(1), match.group(2)))
+ initial_string = match.group(0)
+ dept, courses = match.group(1, 2)
+ courses = re.sub(r'(and|or|[,;\-/])', ' ', courses).strip().split()
+ def splitCourseNumber (course):
+ match = re.match(r'(\d*)([A-Z]*)', course)
+ enforce(match, "Invalid course number: '%s' (for dept '%s', iniital string '%s'",
+ course, dept, initial_string)
+ return match.group(1, 2)
+
+ dept_set.add(dept)
+ if not re.match(r'[A-Z]{2,}', dept):
+ try:
+ dept = {
+ 'Calculus': 'MATH',
+ 'Chem': 'CHEM',
+ 'Chemistry': 'CHEM',
+ 'Cog Sci': 'COGS',
+ 'Cognitive Science': 'COGS',
+ 'Economics': 'ECON',
+ 'Econ': 'ECON',
+ 'Enrollment Special Studies Courses': 'ESSC',
+ 'Hum': 'HUM',
+ 'Math': 'MATH',
+ 'Math Level': 'Math Level',
+ 'Mathematics': 'MATH',
+ 'Neurology': 'NEU',
+ 'Neurosci': 'NEU',
+ 'Neurosciences': 'NEU',
+ 'Pharm': 'PHARM',
+ 'Philosophy': 'PHIL',
+ 'Phys': 'PHYS',
+ 'Physics': 'PHYS',
+ 'Poli Sci': 'POLI',
+ 'Psyc': 'PSYC',
+ 'Psych': 'PSYC',
+ 'Psychology': 'PSYC',
+ 'Science': '??? Science',
+ 'Special Studies Courses': 'SSC',
+ 'G': 'G ???'
+ }[dept]
+ except KeyError:
+ enforce(False, "Unrecognized department '%s'", dept)
+ prevNumber = None
+ dept += ' '
+ for course in courses:
+ n, a = splitCourseNumber(course)
+ if n:
+ prevNumber = n
+ else:
+ n = prevNumber
+ prereq_requirements.add(dept + n + a)
+
+
+ replace_cases = [
+ (r'none', ''),
+ (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*|[A-Z]+)\s+((\d+[A-Z\-/]*)(\s+(and|or)\s+[A-Z])+)(?:\s+|$)', parse_annoying_edge_case),
+ # (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*|[A-Z]+)\s+(\d+[A-Z\-]*)', course_case_single),
+ # (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*|[A-Z]+)\s+(\d+[A-Z\-]*(?:\s+or\s+\d+[A-Z\-]*)*)', course_case_concatenative_or),
+ # (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*|[A-Z]+)\s+(\d+[A-Z\-]*(?:\s+and\s+\d+[A-Z\-]*)*)', course_case_concatenative_and),
+ (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*|[A-Z]+)\s+((?:\d+[A-Z\-/]*(?:,\s+|,?\s+(?:or|and)\s+))*\d+[A-Z\-/]*)', parse_fucking_ridiculous_everything_case),
+ # (r'([A-Z]\w+) ((\d\w+), )or (\d+\w+)', course_case_multiple_or),
+
+ (r'[Ll]imited to BMS graduate students except by consent of instructor', requirement("GRADUATE_STANDING", "BMS_STUDENTS_ONLY", "INSTRUCTOR_APPROVAL")),
+ (r'[Ll]imited to senior undergraduates, graduate students, and medical students', requirement('GRADUATE_STANDING', 'SENIOR_STANDING', 'MEDICAL_STUDENT')),
+ (r'in bioengineering', requirement('BIOENGINEERING_MAJOR')),
+ (r'in bioengineering', requirement('SOCIOLOGY_MAJOR')),
+ (r'biological sciences', requirement("GRADUATE_STANDING", "BIOLOGICAL_SCIENCES_MAJOR")),
+ (r'standard undergraduate biology courses', requirement('BICD 1', 'BICD 2', 'BICD 3', 'BICD 4')),
+ (r'admission to Skaggs School of Pharmacy and Pharmaceutical Sciences or BMS Program \(major Code BS75\)', requirement('ADMITTED_SKAGGS_SCHOOL', 'BMS_STUDENT')),
+ (r'MAS( program| students?)?', requirement('ADMITTED_MAS_CLINICAL_RESEARCH_PROGRAM')),
+
+ (r'completion of college writing', requirement('COMPLETED_COLLEGE_WRITING')),
+ (r'admission to the MAS Clinical Research Program', requirement("ADMITTED_MAS_CLINICAL_RESEARCH_PROGRAM")),
+ (r'admission to (the )?MFA theatre program', requirement("ADMITTED_MFA_THEATRE_PROGRAM")),
+
+ (r'PhD', requirement('PHD_STANDING', 'GRADUATE_STANDING')),
+ (r'(for )?graduate( students?)?( standing| status)?( required)?', requirement('GRADUATE_STANDING')),
+ (r'([Uu]ndergraduates must be )?seniors?( standing)?( required)?', requirement('SENIOR_STANDING')),
+ (r'upper.division standing( required)?', requirement('UPPER_DIVISION_STANDING')),
+ (r'lower.division standing( required)?', requirement('LOWER_DIVISION_STANDING')),
+ (r'first.(year?)', requirement('REQUIRES_FIRST_YEAR_STUDENT')),
+ (r'second.(year?)', requirement('REQUIRES_SECOND_YEAR_STUDENT')),
+ (r'third.year', requirement('REQUIRES_THIRD_YEAR_STUDENT')),
+ (r'transfer standing( required)?', requirement('TRANSFER_STANDING')),
+ (r'for transfer students?', requirement('FOR_TRANSFER_STUDENTS')),
+
+ (r'AuD student', requirement("AUD_MAJOR")),
+ (r'Economics ', requirement("ECONOMICS_MAJOR")),
+ (r'Rady', requirement("RADY_MAJOR")),
+ (r'admission to PhD program in theatre', requirement("ADMITTED_PHD_THEATRE_PROGRAM")),
+ (r'design students?( only)?', requirement("DESIGN_MAJOR")),
+ (r'psychology majors?( only)?', requirement("PSYCHOLOGY_MAJOR")),
+ (r'GPS student?( only)?', requirement("GPS_MAJOR")),
+
+ (r'Sixth College (students?)?( only)?', requirement("SIXTH_COLLEGE")),
+ (r'Revelle College', requirement("REVELLE_COLLEGE")),
+
+ (r'(consent of (the ))?[Dd]epartment(al)? (stamp|approval|chair)?( required)?', requirement('DEPARTMENT_APPROVAL')),
+ (r'(consent of )?[Ii]nstruct(or)?( approval)?', requirement('INSTRUCTOR_APPROVAL')),
+ (r'(program approval)', requirement('PROGRAM_APPROVAL')),
+ (r'((status or )?consent of graduate program director)', requirement('REQUIRES_GRADUATE_PROGRAM_DIRECTOR_APPROVAL')),
+
+ (r'(by |through )?audition( required)?', requirement('REQUIRES_AUDITION')),
+ (r'(upper.division or graduate courses in molecular and cell biology)', requirement('UPPER_DIV_OR_GRADUATE_MCB_COURSES')),
+ (r'Restricted to students within the DS25 major', requirement("REQUIRES_DS25_MAJOR")),
+ (r'All other students will be allowed as space permits', requirement("OTHER_STUDENTS_ALLOWABLE_AS_SPACE_PERMITS")),
+ (r'enrollment in Science Studies Program', requirement("ENROLLED_IN_SCIENCES_STUDY_PROGRAM")),
+ (r'Bioengineering or Bioengineering: Biotechnology majors only', requirement("BIOENGINEERING_OR_BIOTECH_MAJORS_ONLY")),
+ (r'by invitation only', requirement("BY_INVITATION_ONLY")),
+ (r'MDE students only', requirement("MDE_STUDENTS_ONLY")),
+ (r'(with a )?grade of [A-Z]+.?( or better)?(, or equivalent)?',''),
+ (r'(or enrolled in|the department|or equivalent|(successful )?completion of)', ''),
+ (r'(in music)', ''),
+ (r'[Ee]nrollment (restricted to|by completion of prerequisites or by)', ''),
+ (r'\(S/U grades? (permitted|(option )?only)\.\)', ''),
+ (r'\([FWS](,[FWS])*\)', ''),
+ (r'^\s*((and|or|for|[,;\.\(\)])\s*)+$', ''),
+ ]
+ if prereqs:
+ original = prereqs
+ for r, s in replace_cases:
+ prereqs = re.sub(r, s, prereqs).strip()
+ # if prereqs:
+ # print(original)
+ # print("\t'%s'"%prereqs)
+ return { 'name': name, 'dept': name.split()[0], 'title': title, 'description': descr, 'prereqs': list(prereq_requirements) }
+
+ def process (soup):
+ num_courses = 0
+ for a in soup.find_all('a'):
+ try:
+ match = re.match(course_regex, a.attrs['id'])
+ if not match:
+ continue
+ text = getSiblingTextUntilNextAnchor(a).strip()
+ # print(text)
+ if '\n' in text:
+ items = text.split('\n')
+ header = items[0].strip()
+ descrip = items[1].strip()
+ # descrip = '\n'.join(items[1:]).strip()
+ else:
+ header, descrip = text.strip(), ''
+ # print(header)
+ if '.' in header:
+ items = header.split('.')
+ name = items[0].strip()
+ rest = '.'.join(items[1:]).strip()
+ else:
+ name, rest = header, ''
+ course = process_course(name, rest, descrip)
+ if course:
+ num_courses += 1
+ output['courses'][course['name']] = course
+ except KeyError:
+ continue
+ print("%d / %d: Parsed '%s': %s courses"%(
+ item['item_index'] + 1, item['total_items'], item['url'], num_courses))
+ return fetch_html(item['url'], process)
+
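+# Worker for the multiprocessing pool: crawls a single department page and returns its parsed courses.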
+def do_work (x):
+ get_page_courses(x['work_item']['dept'], x['work_item'], x)
+ return x['courses']
+
+def fetch_ucsd_courses (
+ base_url='http://ucsd.edu/catalog',
+ out_file=None,
+ parallelism=16,
+ return_results=True,
+):
+ print("Fetching course pages...")
+ course_pages = get_catalog_course_pages(base_url)
+ print("Got %d pages from %s"%(len(course_pages), base_url))
+
+ for i, (k, x) in enumerate(course_pages.iteritems()):
+ course_pages[k]['item_index'] = i
+ course_pages[k]['total_items'] = len(course_pages)
+ course_pages[k]['dept'] = k
+
+ output = { 'courses': {} }
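+ # Crawl department pages either through a worker pool or serially, merging each page's courses into one dict keyed by course name.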
+ if parallelism > 1:
+ from multiprocessing import Pool
+ pool = Pool(parallelism)
+ items = [ { 'courses': {}, 'work_item': item } for k, item in course_pages.iteritems() ]
+ courses = pool.map(do_work, items)
+ for result in courses:
+ output['courses'].update(result)
+ else:
+ for k, x in course_pages.iteritems():
+ get_page_courses(k, x, output)
+
+ if out_file:
+ import json
+ with open(out_file, 'w') as f:
+ json.dump(output, f)
+ print("Wrote %d courses to '%s'"%(len(output['courses']), out_file))
+
+ if return_results:
+ return output
+
+
+if __name__ == '__main__':
+ import argparse
+ parser = argparse.ArgumentParser(description='Fetches course data from the UCSD course catalog')
+ parser.add_argument('-o', '--out', type=str, help='output file', nargs='?', default='ucsd_courses.json')
+ parser.add_argument('-n', '--parallel', type=int, nargs='?', default=16)
+ args = parser.parse_args()
+
+ fetch_ucsd_courses(
+ base_url = 'http://ucsd.edu/catalog',
+ out_file = args.out,
+ parallelism = args.parallel)
diff --git a/crawlers/ucsd/ucsd_graph_gen.py b/crawlers/ucsd/ucsd_graph_gen.py
new file mode 100644
index 0000000..793c573
--- /dev/null
+++ b/crawlers/ucsd/ucsd_graph_gen.py
@@ -0,0 +1,122 @@
+from ucsd_crawler import fetch_ucsd_courses
+import json
+
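+# Build the vis.js node/edge lists; lookup_table maps each course name to its node id.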
+def generate_graph_data (courses, limit = -1):
+ edges = []
+ nodes = []
+ lookup_table = {}
+
+ def insert_entity (name, info):
+ id = lookup_table[name] = len(nodes)
+ nodes.append({
+ 'id': len(nodes),
+ 'label': name,
+ 'title': info['title'] if 'title' in info else '',
+ 'dept': info['dept'] if 'dept' in info else name.strip().split()[0],
+ 'description': info['description'] if 'description' in info else '',
+ 'edges_from': set(),
+ 'edges_to': set()
+ })
+
+ def lookup (name, info = {}):
+ if name not in lookup_table:
+ insert_entity(name, info)
+ return lookup_table[name]
+
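+ # Each prerequisite contributes a directed edge from the prerequisite node to the course that requires it.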
+ for course, info in courses.iteritems():
+ if limit >= 0:
+ if limit == 0:
+ break
+ limit -= 1
+ self = lookup(course, info)
+ for node in map(lookup, info['prereqs']):
+ edges += [{ 'from': node, 'to': self }]
+ nodes[self]['edges_from'].add(node)
+ nodes[node]['edges_to'].add(self)
+
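+ # Sets are not JSON-serializable, so convert the per-node edge sets to lists before returning.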
+ for i, _ in enumerate(nodes):
+ nodes[i]['edges_from'] = list(nodes[i]['edges_from'])
+ nodes[i]['edges_to'] = list(nodes[i]['edges_to'])
+ return { 'edges': edges, 'nodes': nodes }
+
+if __name__ == '__main__':
+ import argparse
+ parser = argparse.ArgumentParser(description='Generates vizjs graph data from the ucsd course catalog')
+ parser.add_argument('-i', '--input', type=str, help='input file', nargs='?', default='ucsd_courses.json')
+ parser.add_argument('-o', '--out', type=str, help='output file', nargs='?', default='ucsd_graph_data.json')
+ parser.add_argument('-r', '--rebuild', default=False, action='store_true')
+ parser.add_argument('-l', '--limit', type=int, default=-1)
+ parser.add_argument('-n', '--parallel', type=int, nargs='?', default=16)
+ parser.add_argument('--indent', type=int, nargs='?', default=0)
+ parser.add_argument('--sort_keys', type=bool, nargs='?', default=True)
+ parser.add_argument('-p', '--_print', default=False, action='store_true')
+ args = parser.parse_args()
+
+ if args.rebuild:
+ content = fetch_ucsd_courses(
+ out_file = args.input,
+ return_results = True,
+ parallelism = args.parallel
+ )
+ else:
+ with open(args.input, 'r') as f:
+ content = json.loads(f.read())
+ # print(len(content['courses']))
+ courses = content['courses']
+
+ with open(args.out, 'w') as f:
+ graph_data = generate_graph_data(courses, limit=args.limit)
+ data = {
+ 'course_info': {
+ 'ucsd': {
+ 'courses': content['courses'],
+ 'vizjs': graph_data
+ }
+ }
+ }
+ # print(len(data))
+ # print(len(data['nodes']))
+ # print(len(data['edges']))
+ # print(len(data['data']))
+ if args.indent:
+ json.dump(data, f, indent=args.indent, sort_keys=args.sort_keys)
+ else:
+ json.dump(data, f, sort_keys=args.sort_keys)
+ if args._print:
+ if args.indent:
+ print(json.dumps(data, indent=args.indent, sort_keys=args.sort_keys))
+ else:
+ print(json.dumps(data, sort_keys=args.sort_keys))
+
+ missing_references = {}
+ resolved_references = {}
+ for course, info in sorted(courses.iteritems(), key = lambda (k,v): k):
+ for name in info['prereqs']:
+ if name not in courses:
+ if name not in missing_references:
+ missing_references[name] = { 'count': 1, 'refby': set(), 'name': name }
+ else:
+ missing_references[name]['count'] += 1
+ missing_references[name]['refby'].add(course)
+ else:
+ if name not in resolved_references:
+ resolved_references[name] = courses[name]
+ courses[name]['count'] = 1
+ courses[name]['refby'] = set()
+ else:
+ resolved_references[name]['count'] += 1
+ resolved_references[name]['refby'].add(course)
+ # print("%s resolved references"%(len(resolved_references)))
+ # for k, v in sorted(resolved_references.iteritems(), key = lambda (k, v): k):
+ # print("\t%s (%s references): %s"%(k, v['count'], ', '.join(v['refby'])))
+
+ # print("\n%s missing references"%(len(missing_references)))
+ # for k, v in sorted(missing_references.iteritems(), key = lambda (k, v): k):
+ # print("\t%s (%s references): %s"%(k, v['count'], ', '.join(v['refby'])))
+
+
+
diff --git a/jest.config.js b/jest.config.js
new file mode 100644
index 0000000..d6f14a5
--- /dev/null
+++ b/jest.config.js
@@ -0,0 +1,16 @@
+const TEST_REGEX = '(/__tests__/.*|(\\.|/)(test|spec))\\.(jsx?|js?|tsx?|ts?)$';
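+// Match files under __tests__/ as well as *.test.* or *.spec.* files with a js/jsx/ts/tsx extension.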
+
+module.exports = {
+ setupFiles: ['<rootDir>/jest.setup.js'],
+ testRegex: TEST_REGEX,
+ transform: {
+ '^.+\\.jsx?$': 'babel-jest',
+ },
+ testPathIgnorePatterns: [
+ '/.next/', '/node_modules/',
+ ],
+ moduleFileExtensions: [
+ 'ts', 'tsx', 'js', 'jsx',
+ ],
+ collectCoverage: true,
+};
diff --git a/jest.setup.js b/jest.setup.js
new file mode 100644
index 0000000..4887d3f
--- /dev/null
+++ b/jest.setup.js
@@ -0,0 +1,4 @@
+const Enzyme = require('enzyme');
+const Adapter = require('enzyme-adapter-react-16');
+
+Enzyme.configure({adapter: new Adapter()});
diff --git a/next.config.js b/next.config.js
new file mode 100644
index 0000000..583ad48
--- /dev/null
+++ b/next.config.js
@@ -0,0 +1,10 @@
+module.exports = {
+ webpack: (config) => {
+ // Fixes npm packages that depend on `fs` module
+ config.node = {
+ fs: 'empty',
+ };
+
+ return config;
+ },
+};
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..daab75d
--- /dev/null
+++ b/package.json
@@ -0,0 +1,67 @@
+{
+ "name": "course-graph",
+ "scripts": {
+ "dev": "node server/index.js",
+ "pretest": "eslint . --ext js --ext jsx",
+ "test": "jest",
+ "test:ci": "jest --coverage --coverageReporters=text-lcov | coveralls",
+ "build": "next build",
+ "start": "NODE_ENV=production node server/index.js",
+ "docs": "jsdoc -c conf.json -d docs -R README.md components"
+ },
+ "dependencies": {
+ "@material-ui/core": "^1.4.0",
+ "@material-ui/icons": "^2.0.0",
+ "algoliasearch": "^3.29.0",
+ "bcrypt-nodejs": "0.0.3",
+ "body-parser": "latest",
+ "compression": "^1.7.2",
+ "connect-mongo": "^2.0.1",
+ "cors": "^2.8.4",
+ "crypto": "^1.0.1",
+ "express": "^4.16.3",
+ "express-flash": "0.0.2",
+ "express-session": "^1.15.6",
+ "express-validator": "^5.2.0",
+ "isomorphic-unfetch": "^2.0.0",
+ "jss": "^9.8.7",
+ "lru-cache": "^4.1.3",
+ "mongoose": "^5.2.1",
+ "next": "latest",
+ "nprogress": "^0.2.0",
+ "passport": "^0.4.0",
+ "passport-local": "^1.0.0",
+ "prop-types": "^15.6.2",
+ "qs": "^6.5.2",
+ "react": "^16.4.1",
+ "react-dom": "^16.4.1",
+ "react-draggable": "^3.0.5",
+ "react-graph-vis": "^1.0.2",
+ "react-instantsearch": "^5.2.2",
+ "react-jss": "^8.6.1",
+ "react-particles-js": "^2.2.0",
+ "reactjs-popup": "^1.1.1",
+ "styled-jsx": "^2.2.7"
+ },
+ "devDependencies": {
+ "@pixi/jsdoc-template": "^2.4.2",
+ "babel-core": "7.0.0-bridge.0",
+ "babel-eslint": "^8.2.5",
+ "babel-jest": "^23.2.0",
+ "coveralls": "^3.0.2",
+ "enzyme": "^3.3.0",
+ "enzyme-adapter-react-16": "^1.1.1",
+ "eslint": "^5.0.1",
+ "eslint-config-fbjs": "^2.0.1",
+ "eslint-plugin-babel": "^5.1.0",
+ "eslint-plugin-flowtype": "^2.49.3",
+ "eslint-plugin-jsx-a11y": "^6.1.0",
+ "eslint-plugin-react": "^7.10.0",
+ "eslint-plugin-relay": "0.0.24",
+ "jest": "^23.3.0",
+ "jsdoc": "^3.5.5",
+ "morgan": "^1.9.0",
+ "react-addons-test-utils": "^15.6.2",
+ "react-test-renderer": "^16.4.1"
+ }
+}
diff --git a/pages/_app.jsx b/pages/_app.jsx
new file mode 100644
index 0000000..9ba9fa0
--- /dev/null
+++ b/pages/_app.jsx
@@ -0,0 +1,50 @@
+import React from 'react';
+import App, { Container } from 'next/app';
+import { MuiThemeProvider } from '@material-ui/core/styles';
+import CssBaseline from '@material-ui/core/CssBaseline';
+import JssProvider from 'react-jss/lib/JssProvider';
+import getPageContext from '../utils/getPageContext';
+
+class MyApp extends App {
+ constructor(props) {
+ super(props);
+ this.pageContext = getPageContext();
+ }
+
+ pageContext = null;
+
+ componentDidMount() {
+ // Remove the server-side injected CSS.
+ const jssStyles = document.querySelector('#jss-server-side');
+ if (jssStyles && jssStyles.parentNode) {
+ jssStyles.parentNode.removeChild(jssStyles);
+ }
+ }
+
+ render() {
+ const {Component, pageProps} = this.props;
+ return (
+ <Container>
+ {/* Wrap every page in Jss and Theme providers */}
+ <JssProvider
+ registry={this.pageContext.sheetsRegistry}
+ generateClassName={this.pageContext.generateClassName}
+ >
+ {/* MuiThemeProvider makes the theme available down the React
+ tree thanks to React context. */}
+ <MuiThemeProvider
+ theme={this.pageContext.theme}
+ sheetsManager={this.pageContext.sheetsManager}
+ >
+ <CssBaseline />
+ {/* Pass pageContext to the _document through the renderPage enhancer
+ to render collected styles on server side. */}
+ <Component pageContext={this.pageContext} {...pageProps} />
+ </MuiThemeProvider>
+ </JssProvider>
+ </Container>
+ );
+ }
+}
+
+export default MyApp;
diff --git a/pages/_document.jsx b/pages/_document.jsx
new file mode 100644
index 0000000..2beb9ca
--- /dev/null
+++ b/pages/_document.jsx
@@ -0,0 +1,94 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+import Document, { Head, Main, NextScript } from 'next/document';
+import flush from 'styled-jsx/server';
+
+class MyDocument extends Document {
+ render() {
+ const {pageContext} = this.props;
+
+ return (
+ <html lang="en" dir="ltr">
+ <Head>
+ <title>Course Graph</title>
+ <meta charSet="utf-8" />
+ {/* Use minimum-scale=1 to enable GPU rasterization */}
+ <meta
+ name="viewport"
+ content="minimum-scale=1, initial-scale=1, width=device-width, shrink-to-fit=no"
+ />
+ {/* PWA primary color */}
+ <meta name="theme-color" content={pageContext.theme.palette.primary.main} />
+ </Head>
+ <body>
+ <Main />
+ <NextScript />
+ </body>
+ </html>
+ );
+ }
+}
+
+MyDocument.getInitialProps = ctx => {
+ // Resolution order
+ //
+ // On the server:
+ // 1. app.getInitialProps
+ // 2. page.getInitialProps
+ // 3. document.getInitialProps
+ // 4. app.render
+ // 5. page.render
+ // 6. document.render
+ //
+ // On the server with error:
+ // 1. document.getInitialProps
+ // 2. app.render
+ // 3. page.render
+ // 4. document.render
+ //
+ // On the client
+ // 1. app.getInitialProps
+ // 2. page.getInitialProps
+ // 3. app.render
+ // 4. page.render
+
+ let pageContext;
+ const page = ctx.renderPage(Component => {
+ const WrappedComponent = props => {
+ pageContext = props.pageContext;
+ return <Component {...props} />;
+ };
+
+ WrappedComponent.propTypes = {
+ pageContext: PropTypes.object.isRequired,
+ };
+
+ return WrappedComponent;
+ });
+
+ return {
+ ...page,
+ pageContext,
+ // Styles fragment is rendered after the app and page rendering finish.
+ styles: (
+ <React.Fragment>
+ <style
+ id="jss-server-side"
+ dangerouslySetInnerHTML={{__html: pageContext.sheetsRegistry.toString()}}
+ />
+ {flush() || null}
+ </React.Fragment>
+ ),
+ };
+};
+
+export default MyDocument;
diff --git a/pages/_error.jsx b/pages/_error.jsx
new file mode 100644
index 0000000..0db9c08
--- /dev/null
+++ b/pages/_error.jsx
@@ -0,0 +1,18 @@
+import React from 'react';
+
+export default class Error extends React.Component {
+ static getInitialProps({res, err}) {
+ const statusCode = res ? res.statusCode : err ? err.statusCode : null;
+ return {statusCode};
+ }
+
+ render() {
+ return (
+ <p>
+ {this.props.statusCode
+ ? `An error ${this.props.statusCode} occurred on server`
+ : 'An error occurred on client'}
+ </p>
+ );
+ }
+}
diff --git a/pages/account/login.jsx b/pages/account/login.jsx
new file mode 100644
index 0000000..d09bdd3
--- /dev/null
+++ b/pages/account/login.jsx
@@ -0,0 +1,10 @@
+import React from 'react';
+import Login from '../../components/login/Login';
+import Header from '../../components/Header';
+
+export default () => (
+ <div>
+ <Header />
+ <Login />
+ </div>
+);
diff --git a/pages/account/signup.jsx b/pages/account/signup.jsx
new file mode 100644
index 0000000..2feddf9
--- /dev/null
+++ b/pages/account/signup.jsx
@@ -0,0 +1,88 @@
+import React, { Component } from 'react';
+
+import Header from '../../components/Header';
+import TextField from '@material-ui/core/TextField';
+import Button from '@material-ui/core/Button';
+
+import fetch from 'isomorphic-unfetch';
+
+export default class Login extends Component {
+ state = {
+ email: '',
+ password: '',
+ confirmPassword: '',
+ };
+
+ handleChange = (name) => (event) => {
+ this.setState({
+ [name]: event.target.value,
+ });
+ };
+
+ /**
+ * @param event
+ * @return {Promise}
+ */
+ handleSubmit = async (event) => {
+ event.preventDefault();
+
+ let data = {
+ email: this.state.email,
+ password: this.state.password,
+ confirmPassword: this.state.confirmPassword,
+ };
+
+ console.log(data);
+
+ // https://coursegraph.org/account/login
+ await fetch('http://localhost:8080/account/signup', {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify(data),
+ });
+ };
+
+ render() {
+ return (
+ <div>
+ <Header />
+ <form onSubmit={this.handleSubmit}>
+ <TextField label="Email" onChange={this.handleChange('email')} />
+ <TextField label="Password" type="password" onChange={this.handleChange('password')} />
+ <TextField label="Confirm Password" type="password" onChange={this.handleChange('confirmPassword')} />
+ <Button type="submit">Sign Up</Button>
+ </form>
+ </div>
+ );
+ }
+}
diff --git a/pages/index.jsx b/pages/index.jsx
new file mode 100644
index 0000000..b380401
--- /dev/null
+++ b/pages/index.jsx
@@ -0,0 +1,71 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+import Particles from 'react-particles-js';
+import { withStyles } from '@material-ui/core/styles';
+
+import Header from '../components/Header';
+import HomePanel from '../components/home/HomePanel';
+
+/**
+ * Define the style of components on this page
+ * @param theme
+ * @return {object}
+ */
+const styles = theme => ({
+ wrapper: {
+ 'text-align': 'center',
+ },
+});
+
+/**
+ * Define the 'particle-js' setting on the background
+ * @type {object}
+ */
+const particleParams = {
+ particles: {
+ 'number': {
+ 'value': 40,
+ 'density': {
+ 'enable': true,
+ 'value_area': 800,
+ },
+ },
+ 'color': {
+ 'value': '#ffffff',
+ },
+ 'opacity': {
+ 'value': 0.5,
+ 'random': false,
+ 'anim': {
+ 'enable': false,
+ 'speed': 1,
+ 'opacity_min': 0.1,
+ 'sync': false,
+ },
+ },
+ },
+};
+
+/**
+ * Home Page
+ */
+class IndexPage extends React.Component {
+ static propTypes = {
+ classes: PropTypes.object.isRequired,
+ };
+
+ render() {
+ const {classes} = this.props;
+
+ return (
+ <div className={classes.wrapper}>
+ <Header />
+ <Particles params={particleParams} />
+ <HomePanel />
+ </div>
+ );
+ }
+}
+
+export default withStyles(styles)(IndexPage);
diff --git a/pages/ucsc/index.jsx b/pages/ucsc/index.jsx
new file mode 100644
index 0000000..2ad8be8
--- /dev/null
+++ b/pages/ucsc/index.jsx
@@ -0,0 +1,80 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+import Router from 'next/router';
+import qs from 'qs';
+
+import App from '../../components/Search';
+import { findResultsState } from '../../components/Instantsearch';
+import Header from '../../components/Header';
+
+/**
+ * @type {number}
+ */
+const updateAfter = 700;
+
+/**
+ * @param searchState
+ * @return {string} the url
+ */
+const searchStateToUrl = searchState =>
+ searchState ? `${window.location.pathname}?${qs.stringify(searchState)}` : '';
+
+/**
+ * Search Page
+ */
+export default class extends React.Component {
+ static propTypes = {
+ resultsState: PropTypes.object,
+ searchState: PropTypes.object,
+ };
+
+ onSearchStateChange = (searchState) => {
+ clearTimeout(this.debouncedSetState);
+ this.debouncedSetState = setTimeout(() => {
+ const href = searchStateToUrl(searchState);
+ Router.push(href, href, {
+ shallow: true,
+ });
+ }, updateAfter);
+ this.setState({searchState});
+ };
+
+ constructor() {
+ super();
+ this.onSearchStateChange = this.onSearchStateChange.bind(this);
+ }
+
+ static async getInitialProps(params) {
+ const searchState = qs.parse(
+ params.asPath.substring(params.asPath.indexOf('?') + 1)
+ );
+ const resultsState = await findResultsState(App, {searchState});
+
+ return {resultsState, searchState};
+ }
+
+ render() {
+ return (
+ <div>
+ <Header />
+ <App resultsState={this.props.resultsState}
+ searchState={this.state ? this.state.searchState : this.props.searchState}
+ onSearchStateChange={this.onSearchStateChange} />
+ </div>
+ );
+ }
+}
diff --git a/pages/ucsd/index.jsx b/pages/ucsd/index.jsx
new file mode 100644
index 0000000..17ad8ce
--- /dev/null
+++ b/pages/ucsd/index.jsx
@@ -0,0 +1,67 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+import fetch from 'isomorphic-unfetch';
+
+import { withStyles } from '@material-ui/core/styles';
+
+import GraphViewAssembly from '../../components/graph/GraphViewAssembly';
+import Header from '../../components/Header';
+
+/**
+ * Define the style of components on this page
+ * @param theme
+ * @return {object}
+ */
+const styles = theme => ({
+ wrapper: {
+ 'text-align': 'center',
+ },
+ appBar: {
+ position: 'absolute',
+ transition: theme.transitions.create(['margin', 'width'], {
+ easing: theme.transitions.easing.sharp,
+ duration: theme.transitions.duration.leavingScreen,
+ }),
+ },
+});
+
+class GraphPage extends React.Component {
+ static propTypes = {
+ classes: PropTypes.object.isRequired,
+ graphData: PropTypes.object.isRequired,
+ };
+
+ /**
+ * @param req
+ * @param query
+ * @return {Promise<*>}
+ */
+ static async getInitialProps({req, query}) {
+ const isServer = !!req;
+
+ // const URL = 'http://localhost:8080/api/graph-data/ucsd';
+ // const URL = 'https://coursegraph.org/api/graph-data/ucsd';
+
+ if (isServer) {
+ return {graphData: query.itemData};
+ } else {
+ const res = await fetch('https://coursegraph.org/api/graph-data/ucsd');
+ const json = await res.json();
+ return {graphData: json};
+ }
+ }
+
+ render() {
+ const {classes, graphData} = this.props;
+
+ return (
+ <div className={classes.wrapper}>
+ <Header />
+ <GraphViewAssembly graphData={graphData} />
+ </div>
+ );
+ }
+}
+
+export default withStyles(styles)(GraphPage);
diff --git a/server/config/passport.js b/server/config/passport.js
new file mode 100644
index 0000000..e7e002f
--- /dev/null
+++ b/server/config/passport.js
@@ -0,0 +1,52 @@
+const passport = require('passport');
+const {Strategy: LocalStrategy} = require('passport-local');
+
+const User = require('../models/user');
+
+passport.serializeUser((user, done) => {
+ done(null, user.id);
+});
+
+passport.deserializeUser((id, done) => {
+ User.findById(id, (err, user) => {
+ done(err, user);
+ });
+});
+
+/**
+ * Sign in using Email and Password.
+ */
+passport.use(new LocalStrategy({usernameField: 'email'},
+ (email, password, done) => {
+ User.findOne({email: email.toLowerCase()}, (err, user) => {
+ if (err) {
+ return done(err);
+ }
+
+ if (!user) {
+ return done(null, false, {msg: `Email ${email} not found.`});
+ }
+
+ return user.comparePassword(password, (err, isMatch) => {
+ if (err) {
+ return done(err);
+ }
+ if (isMatch) {
+ return done(null, user);
+ }
+ return done(null, false, {msg: 'Invalid email or password.'});
+ });
+ });
+ }));
+
+/**
+ * Login Required middleware.
+ */
+exports.isAuthenticated = (req, res, next) => {
+ if (req.isAuthenticated()) {
+ return next();
+ }
+
+ return res.redirect('/account/login');
+};
+
diff --git a/server/controllers/courses.js b/server/controllers/courses.js
new file mode 100644
index 0000000..be52281
--- /dev/null
+++ b/server/controllers/courses.js
@@ -0,0 +1,33 @@
+const UCSC = require('../models/ucsc_course');
+const UCSD = require('../models/ucsd_courses');
+
+/**
+ * @type {Map.<string, mongoose.Model>}
+ */
+let schoolMap = new Map([
+ ['UCSC', UCSC],
+ ['UCSD', UCSD],
+]);
+
+/**
+ * GET /api/courses/:id - return all courses for the requested school (defaults to UCSD).
+ */
+exports.getCourses = (req, res) => {
+ const schoolName = req.params.id || 'UCSD';
+ const school = schoolMap.get(schoolName.toUpperCase());
+
+ // console.log(`Asking for ${schoolName}...`);
+
+ if (school) {
+ school.find({}).lean().exec((err, course) => {
+ if (err) {
+ return console.error(err);
+ }
+ return res.json(course);
+ });
+ } else {
+ return res.json([]);
+ }
+};
+
+
diff --git a/server/controllers/home.js b/server/controllers/home.js
new file mode 100644
index 0000000..40a8eee
--- /dev/null
+++ b/server/controllers/home.js
@@ -0,0 +1,11 @@
+/**
+ * GET /
+ * Home page.
+ * @param app {App}
+ * @return {Function}
+ */
+exports.index = (app) => (req, res) => {
+ // check if logged in already,
+
+ res.redirect('/');
+};
diff --git a/server/controllers/user.js b/server/controllers/user.js
new file mode 100644
index 0000000..3174653
--- /dev/null
+++ b/server/controllers/user.js
@@ -0,0 +1,150 @@
+const {promisify} = require('util');
+const crypto = require('crypto');
+const passport = require('passport');
+
+const User = require('../models/user');
+
+const randomBytesAsync = promisify(crypto.randomBytes);
+
+/**
+ * GET /login
+ * Login page.
+ * @param app {App} Next app
+ * @return {Function}
+ */
+exports.getLogin = (app) => (req, res) => {
+ if (req.user) {
+ return res.redirect('/');
+ }
+
+ return app.render(req, res, '/account/login', {
+ title: 'Login',
+ });
+};
+
+/**
+ * POST /login
+ * Sign in using email and password.
+ * @param app {App} Next app
+ * @return {Function}
+ */
+exports.postLogin = (app) => (req, res, next) => {
+ req.assert('email', 'Email is not valid').isEmail();
+ req.assert('password', 'Password cannot be blank').notEmpty();
+ req.sanitize('email').normalizeEmail({gmail_remove_dots: false});
+
+ const errors = req.validationErrors();
+
+ if (errors) {
+ // req.flash('errors', errors);
+ return res.redirect('/account/error'); // login
+ }
+
+ return passport.authenticate('local', (err, user, info) => {
+ if (err) {
+ return next(err);
+ }
+
+ if (!user) {
+ // req.flash('errors', info);
+ return res.redirect('/account/error');
+ }
+
+ return req.logIn(user, (err) => {
+ if (err) {
+ return next(err);
+ }
+
+ // req.flash('success', {msg: 'Success! You are logged in.'});
+ return res.redirect(req.session.returnTo || '/');
+ });
+ })(req, res, next);
+};
+
+/**
+ * GET /signup
+ * Signup page.
+ * @param app
+ * @return {Function}
+ */
+exports.getSignup = (app) => (req, res) => {
+ if (req.user) {
+ return res.redirect('/');
+ }
+
+ return app.render(req, res, '/account/signup', {
+ title: 'Create Account',
+ });
+};
+
+/**
+ * POST /signup
+ * Create a new local account.
+ * @param app
+ * @return {Function}
+ */
+exports.postSignup = (app) => (req, res, next) => {
+ req.assert('email', 'Email is not valid').isEmail();
+ req.assert('password', 'Password must be at least 4 characters long').len(4);
+ req.assert('confirmPassword', 'Passwords do not match').equals(
+ req.body.password);
+ req.sanitize('email').normalizeEmail({gmail_remove_dots: false});
+
+ const errors = req.validationErrors();
+
+ console.log(req.body);
+ console.log(errors);
+
+ if (errors) {
+ // req.flash('errors', errors);
+ return res.redirect('/account/error');
+ }
+
+ /**
+ * @type {Model}
+ */
+ const user = new User({
+ email: req.body.email,
+ password: req.body.password,
+ });
+
+ // Check if the user already exists
+ return User.findOne({email: req.body.email}, (err, existingUser) => {
+ if (err) {
+ return next(err);
+ }
+
+ if (existingUser) {
+ req.flash('errors', {
+ msg: 'Account with that email address already exists.',
+ });
+ return res.redirect('/account/error');
+ }
+
+ return user.save((err) => {
+ if (err) {
+ return next(err);
+ }
+
+ return req.logIn(user, (err) => {
+ if (err) {
+ return next(err);
+ }
+
+ return res.redirect('/');
+ });
+ });
+ });
+};
diff --git a/server/index.js b/server/index.js
new file mode 100644
index 0000000..bfcba7b
--- /dev/null
+++ b/server/index.js
@@ -0,0 +1,192 @@
+/**
+ * Module dependency
+ */
+const express = require('express');
+const next = require('next');
+const compression = require('compression');
+const session = require('express-session');
+const bodyParser = require('body-parser');
+const mongoose = require('mongoose');
+const passport = require('passport');
+const expressValidator = require('express-validator');
+const LRUCache = require('lru-cache');
+const logger = require('morgan');
+const flash = require('express-flash');
+const MongoStore = require('connect-mongo')(session);
+const cors = require('cors');
+
+/**
+ * Controllers
+ */
+// const homeController = require('./controllers/home');
+const courseController = require('./controllers/courses');
+const userController = require('./controllers/user');
+const api = require('./operations/get_graph_data');
+
+/**
+ * Constant Settings
+ */
+const PORT = parseInt(process.env.PORT, 10) || 8080;
+const dev = process.env.NODE_ENV !== 'production';
+
+const app = next({dev});
+const defaultRequestHandler = app.getRequestHandler();
+
+const LOCAL_DB = 'courses';
+const MONGODB_URI = process.env.MONGODB_URI || `mongodb://localhost:27017/${LOCAL_DB}`;
+
+/**
+ * API keys and Passport configuration.
+ */
+const passportConfig = require('./config/passport');
+
+/**
+ * This is where we cache our rendered HTML pages
+ * @type {LRUCache}
+ */
+const ssrCache = new LRUCache({
+ max: 100,
+ maxAge: 1000 * 60 * 60, // 1hour
+});
+
+app.prepare()
+ .then(() => {
+ /**
+ * Create Express server.
+ */
+ const server = express();
+
+ /**
+ * Express configuration.
+ */
+ server.use(bodyParser.json());
+ server.use(expressValidator());
+ server.use(compression());
+ server.use(logger('dev'));
+ server.use(session({
+ resave: true,
+ saveUninitialized: true,
+ secret: 'I LOVE CMPS115',
+ cookie: {maxAge: 1209600000}, // two weeks in milliseconds
+ store: new MongoStore({
+ url: MONGODB_URI,
+ autoReconnect: true,
+ }),
+ }));
+ server.use(passport.initialize());
+ server.use(flash());
+ server.use(cors());
+
+ /**
+ * Connect to MongoDB.
+ */
+ mongoose.Promise = Promise;
+ mongoose.connect(MONGODB_URI, {useNewUrlParser: true});
+
+ const db = mongoose.connection;
+
+ db.on('error', console.error.bind(console, 'connection error:'));
+ server.use((req, res, next) => {
+ // Expose the MongoDB database handle so Next.js can access it.
+ req.db = db;
+ next();
+ });
+
+ /**
+ * Primary app routes.
+ */
+ server.get('/', (req, res) => {
+ renderAndCache(req, res, '/');
+ });
+
+ server.get('/account/login', userController.getLogin(app));
+ server.post('/account/login', userController.postLogin(app));
+ server.get('/account/signup', userController.getSignup(app));
+ server.post('/account/signup', userController.postSignup(app));
+
+ server.get('/foo', passportConfig.isAuthenticated, (req, res) => {
+ res.send('hello world');
+ });
+
+ server.get('/ucsc', (req, res) => {
+ renderAndCache(req, res, '/ucsc');
+ });
+
+ server.get('/ucsd', (req, res) => {
+ const itemData = api.getGraphData();
+ // renderAndCache(req, res, '/ucsd', {itemData: itemData});
+ app.render(req, res, '/ucsd', {itemData: itemData});
+ });
+
+ /**
+ * API routes.
+ */
+ server.get('/api/courses/:id', courseController.getCourses);
+ server.get('/api/graph-data/:school', (req, res) => {
+ const itemData = api.getGraphData(req.params.school);
+ res.json(itemData);
+ });
+
+ /**
+ * Fall-back on other next.js assets.
+ */
+ server.get('*', (req, res) => {
+ return defaultRequestHandler(req, res);
+ });
+
+ /**
+ * Start Express server.
+ */
+ server.listen(PORT, (err) => {
+ if (err) {
+ throw err;
+ }
+ console.log(`> Ready on http://localhost:${PORT}`);
+ });
+ }).catch(error => console.error(error.stack));
+
+
+/**
+ * @param req
+ * @return {string}
+ */
+function getCacheKey(req) {
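+ // The full request URL (path plus query string) is used as the cache key.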
+ return `${req.url}`;
+}
+
+/**
+ * @param req
+ * @param res
+ * @param pagePath
+ * @param queryParams
+ * @return {Promise}
+ */
+async function renderAndCache(req, res, pagePath, queryParams) {
+ const key = getCacheKey(req);
+
+ // If we have a page in the cache, let's serve it
+ if (ssrCache.has(key)) {
+ res.setHeader('x-cache', 'HIT');
+ res.send(ssrCache.get(key));
+ return;
+ }
+
+ try {
+ // If not let's render the page into HTML
+ const html = await app.renderToHTML(req, res, pagePath, queryParams);
+
+ // Something is wrong with the request, let's skip the cache
+ if (res.statusCode !== 200) {
+ res.send(html);
+ return;
+ }
+
+ // Let's cache this page
+ ssrCache.set(key, html);
+
+ res.setHeader('x-cache', 'MISS');
+ res.send(html);
+ } catch (err) {
+ app.renderError(err, req, res, pagePath, queryParams);
+ }
+}
diff --git a/server/models/ucsc_course.js b/server/models/ucsc_course.js
new file mode 100644
index 0000000..0d468af
--- /dev/null
+++ b/server/models/ucsc_course.js
@@ -0,0 +1,17 @@
+const mongoose = require('mongoose');
+
+const UCSCCourseSchema = new mongoose.Schema({
+ description: {type: String},
+ division: {type: String},
+ geCategories: {type: String},
+ instructor: {type: String},
+ name: {type: String},
+ terms: {type: String},
+ title: {type: String},
+}, {
+ collection: 'ucsc',
+});
+
+const Course = mongoose.model('UCSCCourse', UCSCCourseSchema);
+
+module.exports = Course;
diff --git a/server/models/ucsd_courses.js b/server/models/ucsd_courses.js
new file mode 100644
index 0000000..9e4ecf9
--- /dev/null
+++ b/server/models/ucsd_courses.js
@@ -0,0 +1,15 @@
+const mongoose = require('mongoose');
+
+const UCSDCourseSchema = new mongoose.Schema({
+ dept: String,
+ description: String,
+ name: String,
+ prereqs: [String],
+ title: String,
+}, {
+ collection: 'ucsd',
+});
+
+const Course = mongoose.model('UCSDCourseSchema', UCSDCourseSchema);
+
+module.exports = Course;
diff --git a/server/models/user.js b/server/models/user.js
new file mode 100644
index 0000000..291c127
--- /dev/null
+++ b/server/models/user.js
@@ -0,0 +1,49 @@
+const bcrypt = require('bcrypt-nodejs');
+const mongoose = require('mongoose');
+
+const userSchema = new mongoose.Schema({
+ email: {type: String, unique: true},
+ password: String,
+
+ profile: {
+ courses: Array,
+ },
+}, {timestamps: true});
+
+/**
+ * Password hash middleware.
+ */
+userSchema.pre('save', function save(next) {
+ const user = this;
+ // if (!user.isModified('password')) {
+ // return next();
+ // }
+
+ return bcrypt.genSalt(10, (err, salt) => {
+ if (err) {
+ return next(err);
+ }
+
+ return bcrypt.hash(user.password, salt, null, (err, hash) => {
+ if (err) {
+ return next(err);
+ }
+
+ user.password = hash;
+ return next();
+ });
+ });
+});
+
+/**
+ * Helper method for validating user's password.
+ */
+userSchema.methods.comparePassword = function comparePassword(candidatePassword, cb) {
+ bcrypt.compare(candidatePassword, this.password, (err, isMatch) => {
+ cb(err, isMatch);
+ });
+};
+
+const User = mongoose.model('User', userSchema);
+
+module.exports = User;
diff --git a/server/operations/count_graph_data.js b/server/operations/count_graph_data.js
new file mode 100644
index 0000000..e50c935
--- /dev/null
+++ b/server/operations/count_graph_data.js
@@ -0,0 +1,33 @@
+const fs = require('fs');
+
+// {
+// "dept": "SOCD",
+// "description": "",
+// "edges_from": [],
+// "edges_to": [
+// 74
+// ],
+// "id": 75,
+// "label": "SOCD 158",
+// "title": ""
+// },
+
+// let count = 0;
+let data = JSON.parse(fs.readFileSync('../../data/ucsd_graph_data.json', 'utf8'));
+
+let deptSet = new Set();
+
+for (const obj of data.nodes) {
+ if (obj.dept) {
+ deptSet.add(obj.dept);
+ }
+}
+
+// Serialize the department list as valid JSON (no trailing comma).
+const text = JSON.stringify(Array.from(deptSet), null, 2);
+
+console.log(`Found ${deptSet.size} departments:`);
+console.log(text);
diff --git a/server/operations/gen_actual_data.js b/server/operations/gen_actual_data.js
new file mode 100644
index 0000000..c088f46
--- /dev/null
+++ b/server/operations/gen_actual_data.js
@@ -0,0 +1,59 @@
+const fs = require('fs');
+const mongoose = require('mongoose');
+
+const UCSC = require('../models/ucsc_course');
+const UCSD = require('../models/ucsd_courses');
+
+/**
+ * "course_info": {
+ "ucsd": {
+ "courses": {
+ * @param data
+ */
+function parse_ucsd_courses(data) {
+ const courses = Object.values(data.course_info.ucsd.courses);
+
+ let arrOfCourses = [];
+
+ for (const course of courses) {
+ arrOfCourses.push(course);
+ }
+
+ return arrOfCourses;
+}
+
+function gen() {
+
+ mongoose.connect('mongodb://localhost:27017/courses', {useNewUrlParser: true});
+ let db = mongoose.connection;
+
+ db.on('error', console.error.bind(console, 'connection error:'));
+ db.once('open', () => {
+ // ucsc
+ let count = 0;
+ let data = JSON.parse(fs.readFileSync('../../data/courses.json', 'utf8'));
+
+ data.forEach((obj) => {
+ let thing = new UCSC(obj);
+ count++;
+ thing.save();
+ });
+
+ console.log(`Saved ${count}`);
+
+ // ucsd
+ count = 0;
+ data = JSON.parse(fs.readFileSync('../../data/ucsd_all_data.json', 'utf8'));
+ data = parse_ucsd_courses(data);
+
+ data.forEach((obj) => {
+ let thing = new UCSD(obj);
+ count++;
+ thing.save();
+ });
+
+ console.log(`Saved ${count}`);
+ });
+}
+
+gen();
diff --git a/server/operations/get_course_db.js b/server/operations/get_course_db.js
new file mode 100644
index 0000000..f5892ca
--- /dev/null
+++ b/server/operations/get_course_db.js
@@ -0,0 +1,22 @@
+const Course = require('../models/ucsc_course');
+
+/**
+ * @return {Promise.<Array.<Object>>}
+ */
+function get() {
+ // exec() with no callback returns a promise resolving to the plain course objects.
+ return Course.find({}).lean().exec();
+}
+
+module.exports = get;
diff --git a/server/operations/get_graph_data.js b/server/operations/get_graph_data.js
new file mode 100644
index 0000000..b60f1ef
--- /dev/null
+++ b/server/operations/get_graph_data.js
@@ -0,0 +1,28 @@
+const fs = require('fs');
+const path = require('path');
+
+const data = fs.readFileSync(
+ path.join(__dirname, '../../data/ucsd_graph_data.json'), 'utf8');
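+// The graph JSON is read once at module load; getGraphData() parses a fresh copy on every call.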
+
+/**
+ * @type {Map.}
+ */
+const schoolMap = new Map([
+ ['UCSD', data],
+]);
+
+/**
+ * @param school {string}
+ * @return {object}
+ */
+function getGraphData(school = 'UCSD') {
+ const graphData = schoolMap.get(school.toUpperCase());
+
+ if (!graphData) {
+ return {};
+ }
+
+ return JSON.parse(graphData);
+}
+
+module.exports = {getGraphData};
diff --git a/server/operations/parse_graph_data.js b/server/operations/parse_graph_data.js
new file mode 100644
index 0000000..6859542
--- /dev/null
+++ b/server/operations/parse_graph_data.js
@@ -0,0 +1,20 @@
+const fs = require('fs');
+
+
+const data = JSON.parse(
+ fs.readFileSync('../../data/ucsd_all_data.json', 'utf8'));
+
+const vizjs = data.course_info.ucsd.vizjs;
+
+// for (const entry of Object.entries(courses)) {
+// let key = entry[0];
+// let value = entry[1];
+// }
+
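+// Write only the extracted vis.js section to its own graph data file.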
+fs.writeFile('../../data/ucsd_graph_data.json', JSON.stringify(vizjs), (err) => {
+ if (err) {
+ throw err;
+ }
+ console.log('The file has been saved!');
+});
+
diff --git a/server/operations/parse_raw_data.js b/server/operations/parse_raw_data.js
new file mode 100644
index 0000000..2b0eeb7
--- /dev/null
+++ b/server/operations/parse_raw_data.js
@@ -0,0 +1,67 @@
+const fs = require('fs');
+
+const data = JSON.parse(fs.readFileSync('../../data/data.json', 'utf8'));
+
+const courseTemplate = Object.keys({
+ description: '',
+ division: '',
+ geCategories: '',
+ instructor: '',
+ name: '',
+ terms: '',
+ title: '',
+});
+
+let arr = [];
+
+/**
+ * @param obj
+ * @return {boolean}
+ */
+function check(obj) {
+ // Every template field must be present on the course object and be a string.
+ return courseTemplate.every(
+ (field) => obj.hasOwnProperty(field) && typeof obj[field] === 'string');
+}
+
+function parseCourse(data) {
+ let courses = Object.values(data);
+ for (const obj of courses) {
+ if (check(obj)) {
+ arr.push(obj);
+ } else {
+ console.log('Something wrong with the data: ');
+ console.log(data);
+ }
+ }
+}
+
+let count = 0;
+for (const pair of Object.entries(data)) {
+ let key = pair[0];
+ let courses = pair[1].courses;
+
+ count++;
+ if (courses) {
+ console.log(`${key}: ${Object.keys(courses).length}`);
+
+ parseCourse(courses);
+ } else {
+ console.log(`${key}`);
+ }
+}
+console.log(count);
+
+fs.writeFile('../../data/courses.json', JSON.stringify(arr), (err) => {
+ if (err) {
+ throw err;
+ }
+ console.log('The file has been saved!');
+});
+
diff --git a/static/favicon.ico b/static/favicon.ico
new file mode 100644
index 0000000..3e264cd
Binary files /dev/null and b/static/favicon.ico differ
diff --git a/static/instantsearch.css b/static/instantsearch.css
new file mode 100644
index 0000000..fa77f2f
--- /dev/null
+++ b/static/instantsearch.css
@@ -0,0 +1,53 @@
+.ais-InstantSearch__root {
+ align-items: center;
+}
+
+header {
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+}
+
+content {
+ display: flex;
+ margin: 25px 0;
+}
+
+menu {
+ flex: 2;
+}
+
+footer {
+ text-align: center;
+}
+
+.ais-Pagination {
+ margin-bottom: 25px;
+}
+
+results {
+ flex: 10;
+}
+
+.hit {
+ display: flex;
+ align-items: center;
+}
+
+.hit-actions {
+ display: flex;
+}
+
+.hit-content {
+ padding: 0px 10px;
+}
+
+.hit-picture img {
+ width: 80px;
+ height: 80px;
+}
+
+.hit-type {
+ color: #888888;
+ font-size: 13px;
+}
diff --git a/static/nprogress.css b/static/nprogress.css
new file mode 100644
index 0000000..29f9d1f
--- /dev/null
+++ b/static/nprogress.css
@@ -0,0 +1,82 @@
+/* Make clicks pass-through */
+#nprogress {
+ pointer-events: none;
+}
+
+#nprogress .bar {
+ background: #29d;
+
+ position: fixed;
+ z-index: 1031;
+ top: 0;
+ left: 0;
+
+ width: 100%;
+ height: 2px;
+}
+
+/* Fancy blur effect */
+#nprogress .peg {
+ display: block;
+ position: absolute;
+ right: 0px;
+ width: 100px;
+ height: 100%;
+ box-shadow: 0 0 10px #29d, 0 0 5px #29d;
+ opacity: 1.0;
+
+ -webkit-transform: rotate(3deg) translate(0px, -4px);
+ -ms-transform: rotate(3deg) translate(0px, -4px);
+ transform: rotate(3deg) translate(0px, -4px);
+}
+
+/* Remove these to get rid of the spinner */
+#nprogress .spinner {
+ display: block;
+ position: fixed;
+ z-index: 1031;
+ top: 15px;
+ right: 15px;
+}
+
+#nprogress .spinner-icon {
+ width: 18px;
+ height: 18px;
+ box-sizing: border-box;
+
+ border: solid 2px transparent;
+ border-top-color: #29d;
+ border-left-color: #29d;
+ border-radius: 50%;
+
+ -webkit-animation: nprogress-spinner 400ms linear infinite;
+ animation: nprogress-spinner 400ms linear infinite;
+}
+
+.nprogress-custom-parent {
+ overflow: hidden;
+ position: relative;
+}
+
+.nprogress-custom-parent #nprogress .spinner,
+.nprogress-custom-parent #nprogress .bar {
+ position: absolute;
+}
+
+@-webkit-keyframes nprogress-spinner {
+ 0% {
+ -webkit-transform: rotate(0deg);
+ }
+ 100% {
+ -webkit-transform: rotate(360deg);
+ }
+}
+
+@keyframes nprogress-spinner {
+ 0% {
+ transform: rotate(0deg);
+ }
+ 100% {
+ transform: rotate(360deg);
+ }
+}
diff --git a/utils/getPageContext.js b/utils/getPageContext.js
new file mode 100644
index 0000000..0852635
--- /dev/null
+++ b/utils/getPageContext.js
@@ -0,0 +1,53 @@
+/* eslint-disable no-underscore-dangle */
+
+import { SheetsRegistry } from 'jss';
+import {
+ createGenerateClassName,
+ createMuiTheme,
+} from '@material-ui/core/styles';
+import purple from '@material-ui/core/colors/purple';
+import green from '@material-ui/core/colors/green';
+
+// A theme with custom primary and secondary color.
+// It's optional.
+const theme = createMuiTheme({
+ palette: {
+ primary: {
+ light: purple[300],
+ main: purple[500],
+ dark: purple[700],
+ },
+ secondary: {
+ light: green[300],
+ main: green[500],
+ dark: green[700],
+ },
+ },
+});
+
+function createPageContext() {
+ return {
+ theme,
+ // This is needed in order to deduplicate the injection of CSS in the page.
+ sheetsManager: new Map(),
+ // This is needed in order to inject the critical CSS.
+ sheetsRegistry: new SheetsRegistry(),
+ // The standard class name generator.
+ generateClassName: createGenerateClassName(),
+ };
+}
+
+export default function getPageContext() {
+ // Make sure to create a new context for every server-side request so that data
+ // isn't shared between connections (which would be bad).
+ if (!process.browser) {
+ return createPageContext();
+ }
+
+ // Reuse context on the client-side.
+ if (!global.__INIT_MATERIAL_UI__) {
+ global.__INIT_MATERIAL_UI__ = createPageContext();
+ }
+
+ return global.__INIT_MATERIAL_UI__;
+}