diff --git a/.babelrc b/.babelrc new file mode 100644 index 0000000..78a3710 --- /dev/null +++ b/.babelrc @@ -0,0 +1,13 @@ +{ + "env": { + "development": { + "presets": ["next/babel"] + }, + "production": { + "presets": ["next/babel"] + }, + "test": { + "presets": [["next/babel", {"preset-env": {"modules": "commonjs"}}]] + } + } +} diff --git a/.eslintignore b/.eslintignore new file mode 100644 index 0000000..f8513c9 --- /dev/null +++ b/.eslintignore @@ -0,0 +1,13 @@ +node_modules/ +.next/ +.idea/ +coverage/ +static/ +spider/ +data/ +assets/ +crawlers/ +docs/ +*.json +*.md +LICENSE diff --git a/.eslintrc b/.eslintrc new file mode 100644 index 0000000..ce46692 --- /dev/null +++ b/.eslintrc @@ -0,0 +1,26 @@ +{ + "extends": [ + "eslint:recommended", + "plugin:react/recommended", + "fbjs" + ], + "env": { + "browser": true, + "commonjs": true, + "node": true, + "es6": true + }, + "parserOptions": { + "ecmaVersion": 8, + "sourceType": "module" + }, + "parser": "babel-eslint", + "rules": { + "no-console": "off", + "strict": [ + "error", + "global" + ], + "curly": "warn" + } +} diff --git a/.gitignore b/.gitignore index ad46b30..9157bff 100644 --- a/.gitignore +++ b/.gitignore @@ -1,61 +1,32 @@ -# Logs -logs -*.log -npm-debug.log* -yarn-debug.log* -yarn-error.log* - -# Runtime data -pids -*.pid -*.seed -*.pid.lock - -# Directory for instrumented libs generated by jscoverage/JSCover -lib-cov - -# Coverage directory used by tools like istanbul -coverage - -# nyc test coverage -.nyc_output - -# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) -.grunt - -# Bower dependency directory (https://bower.io/) -bower_components +# See https://help.github.com/ignore-files/ for more about ignoring files. -# node-waf configuration -.lock-wscript +# dependencies +/node_modules/ -# Compiled binary addons (https://nodejs.org/api/addons.html) -build/Release +# testing +/coverage/ -# Dependency directories -node_modules/ -jspm_packages/ +# production +/build/ +/dist/ +/.next/ -# TypeScript v1 declaration files -typings/ - -# Optional npm cache directory -.npm - -# Optional eslint cache -.eslintcache - -# Optional REPL history -.node_repl_history - -# Output of 'npm pack' -*.tgz +# misc +.DS_Store +.env +npm-debug.log* +yarn-debug.log* +yarn-error.log* -# Yarn Integrity file -.yarn-integrity +/.idea/ +package-lock.json -# dotenv environment variables file -.env +*.pyc +/.coveralls.yml -# next.js build output -.next +/crawlers/ucsc/data/* +/crawlers/ucsc/prereqs +/crawlers/ucsc/unparsed +crawlers/ucsc/temp +crawlers/ucsd/ucsd_courses.json +crawlers/ucsd/ucsd_graph_data.json diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..25a2671 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,10 @@ +language: node_js +node_js: + - "8" + - "9" + - "10" +install: + - npm install +script: + - npm run pretest + - npm run test:ci diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8b2a639 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Course Graph + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice 
shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 3e05d28..36852d7 100644 --- a/README.md +++ b/README.md @@ -1 +1,152 @@ -# CourseGraph \ No newline at end of file +

+
+![coursegraph](./assets/logo.png)
+
+[![Build Status](https://travis-ci.org/coursegraph/CourseGraph.svg?branch=development)](https://travis-ci.org/coursegraph/CourseGraph)
+[![Coverage Status](https://coveralls.io/repos/github/coursegraph/CourseGraph/badge.svg)](https://coveralls.io/github/coursegraph/CourseGraph)
+[![tested with jest](https://img.shields.io/badge/tested_with-jest-99424f.svg)](https://github.com/facebook/jest)
+[![jest](https://jestjs.io/img/jest-badge.svg)](https://github.com/facebook/jest)
+[![license](https://img.shields.io/github/license/mashape/apistatus.svg)](https://github.com/coursegraph/CourseGraph/blob/master/LICENSE)
+

+
+## Introduction
+
+The course search software offered by UCSC (and most colleges) kind of sucks, and there is no easy way to explore classes and majors without a counselor – and the counselors could probably use some help too!
+
+Solution? CourseGraph, a webapp that will:
+
++ datamine pisa and the registrar for course information and sections (and major / minor requirements, if we can do that)
++ display this information visually as a web of interdependent courses and major requirements, filterable and presented through different layers and views
+
+Technology-wise, we will need:
+
++ a web frontend (probably React, vis.js, material-ui) and people interested in UX and software design (myself included)
++ a web backend (probably node, mongoDB) and people interested in backend development and data storage / retrieval
++ several web crawlers to datamine UCSC sites and maybe others; anyone interested in this, please apply!
++ possible integration of other web services (if we could embed e.g. ratemyprofessors, that would be awesome)
+
+Is this feasible in <5 weeks?
+
++ Yes, but it will be challenging, as we'll have a lot of work to do
++ The plus side is we all get to wear lots of hats and use a lot of cool tech to build a real tool that students and counselors can use to explore class options and make planning schedules a lot easier
++ This project can be subdivided, with 2-3 teams working in parallel on different components (e.g. frontend and data mining), so we should be able to work without too many bottlenecks
+
+You do NOT need to have experience with react, node, or vis.js to join this project, just a good attitude and a willingness to learn and contribute.
+
+That said, you will need time to learn a bit of typescript and either frontend (react, vis.js), backend (node, databases – ask Ivan), or data mining (web crawlers, either node or python), since we'll probably be splitting into sub-teams that focus on one of those categories. And you'll need to do this fairly quickly (i.e. over the next few weeks), since we'll need to hit the ground running as soon as possible. Oh, and if you'd like to do project management (as one of your many hats), that would be very useful too.
+
+I'll be learning react and vis.js over the next week or so, so if you're interested in that (whether you're a part of this team or not), please hit me up! (ssemery@ucsc.edu)
+
+## Getting Started
+
+These instructions will get a copy of the project up and running on your local machine for development and testing purposes. (For developers: see deployment for notes on how to deploy the project on a live system.)
+
+### Prerequisites
+
+[Node.js](https://nodejs.org/en/) - JavaScript runtime built on Chrome's V8 JavaScript engine.
+[MongoDB](https://docs.mongodb.com/manual/installation/) - An open-source document database that provides high performance, high availability, and automatic scaling.
+
+The minimum supported Node version is `v6.0.0`; we develop on `v10.0.0`.
+
+```
+node --version
+
+// v10.0.0
+```
+
+### Installing
+
+Clone our repository (or unzip the project and cd into the root folder),
+
+```
+git clone https://github.com/coursegraph/CourseGraph myProject
+cd myProject
+```
+
+and install dependencies via [`npm`](https://www.npmjs.com/) (installed with Node.js):
+
+```
+npm install
+```
+
+### Running the project
+
+Running the project is as simple as running
+
+```
+npm run dev
+```
+
+This runs the `dev` script specified in our `package.json`, and spawns a server which reloads the page as we save our files. The server runs at `http://localhost:3000` by default, and the page should be opened automatically for you.
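+
+For reference, `npm run dev` and the test scripts that `.travis.yml` invokes map to entries in `package.json` roughly like the following (a sketch; the actual `package.json` is not part of this diff, and the exact commands here are assumptions):
+
+```json
+{
+  "scripts": {
+    "dev": "node server.js",
+    "pretest": "eslint .",
+    "test": "jest",
+    "test:ci": "jest --ci --coverage"
+  }
+}
+```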
+
+## Running the tests
+
+Testing is also just a command away:
+
+```
+npm run test
+```
+
+This command runs [`jest`](http://jestjs.io/) and [`enzyme`](http://airbnb.io/enzyme/), an incredibly useful pair of testing utilities.
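+
+For example, `components/Course.test.jsx` (added in this PR) shallow-renders the `Course` component with enzyme and asserts on its markup:
+
+```jsx
+import React from 'react';
+import { shallow } from 'enzyme';
+import Course from './Course';
+
+it('should render title props without throwing an error', () => {
+  const wrapper = shallow(<Course title='CMPS 101'/>);
+  expect(wrapper.find('h3').text()).toEqual('CMPS 101');
+});
+```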
+
+## Built With
+
+* [Next.js](https://nextjs.org/) - A lightweight framework for static and server‑rendered applications.
+* [React](https://reactjs.org) - A JavaScript library for building user interfaces.
+* [Node.js](https://nodejs.org/en/) - A JavaScript runtime built on Chrome's V8 JavaScript engine.
+* [MongoDB](https://www.mongodb.com/) - A general-purpose, document-based database.
+
+## Dependencies
+* [Material-ui/core]
+* [material-ui/icons]
+* [algoliasearch]
+* [bcrypt-nodejs]
+* [body-parser]
+* [compression]
+* [connect-mongo]
+* [crypto]
+* [express]
+* [express-flash]
+* [express-session]
+* [express-validator]
+* [isomorphic-unfetch]
+* [jss]
+* [lru-cache]
+* [mongoose]
+* [next]
+* [nprogress]
+* [passport]
+* [passport-local]
+* [prop-types]
+* [qs]
+* [react]
+* [react-dom]
+* [react-draggable]
+* [react-graph-vis]
+* [react-instantsearch]
+* [react-jss]
+* [reactjs-popup]
+* [styled-jsx]
+
+
+## Authors
+
+* **Seiji Emery** ([SeijiEmery](https://github.com/SeijiEmery)) - Lead Tech Developer
+* **Yanwen Xu** ([RaiderSoap](https://github.com/RaiderSoap)) - :floppy_disk: Back-End Developer
+* **Patrick Lauderdale** ([ThePatrickLauderdale](https://github.com/ThePatrickLauderdale)) - Front-End Developer
+* **Wendy Liang** ([wendyrliang](https://github.com/wendyrliang)) - Front-End Developer
+* **Ka Ho Tran** ([Kutaho](https://github.com/Kutaho)) - Front-End Developer
+* **Nikki Miller** ([NikMills](https://github.com/nikmills)) - Front-End Developer
+
+See also the list of [contributors](https://github.com/coursegraph/CourseGraph/graphs/contributors) who participated in this project.
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## Acknowledgments
+
+Big thanks to Richard Jullig.
+
+:kissing_heart:
\ No newline at end of file
diff --git a/assets/logo.png b/assets/logo.png
new file mode 100644
index 0000000..6106a5d
Binary files /dev/null and b/assets/logo.png differ
diff --git a/components/Course.jsx b/components/Course.jsx
new file mode 100644
index 0000000..4d6683e
--- /dev/null
+++ b/components/Course.jsx
@@ -0,0 +1,30 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+
+/**
+ * @inheritDoc
+ */
+class Course extends React.Component {
+  static propTypes = {
+    title: PropTypes.string.isRequired,
+  };
+
+  static defaultProps = {
+    title: 'Null',
+  };
+
+  /**
+   * @return {Element}
+   */
+  render() {
+    return (
+      <div>
+        <h3>{this.props.title}</h3>
+      </div>
+    );
+  }
+}
+
+export default Course;
diff --git a/components/Course.test.jsx b/components/Course.test.jsx
new file mode 100644
index 0000000..d66913f
--- /dev/null
+++ b/components/Course.test.jsx
@@ -0,0 +1,17 @@
+import React from 'react';
+import { shallow } from 'enzyme';
+
+import Course from './Course';
+
+
+describe('A Course', () => {
+  it('should render default text without throwing an error', () => {
+    const wrapper = shallow(<Course/>);
+    expect(wrapper.find('h3').text()).toEqual('Null');
+  });
+
+  it('should render title props without throwing an error', () => {
+    const wrapper = shallow(<Course title='CMPS 101'/>);
+    expect(wrapper.find('h3').text()).toEqual('CMPS 101');
+  });
+});
diff --git a/components/Header.jsx b/components/Header.jsx
new file mode 100644
index 0000000..880c93e
--- /dev/null
+++ b/components/Header.jsx
@@ -0,0 +1,34 @@
+import React from 'react';
+import NextHead from 'next/head';
+import NProgress from 'nprogress';
+import Router from 'next/router';
+
+Router.onRouteChangeStart = (url) => {
+  NProgress.start();
+};
+
+Router.onRouteChangeComplete = () => NProgress.done();
+Router.onRouteChangeError = () => NProgress.done();
+
+/**
+ * A header Component that provides a progress bar
+ * @return {*}
+ */
+export default () => (
+  <div>
+    <NextHead>
+      <title>Course Graph</title>
+      {/* assumed head tags; the nprogress stylesheet is required for the
+          route-change progress bar */}
+      <meta charSet="utf-8"/>
+      <meta name="viewport" content="initial-scale=1.0, width=device-width"/>
+      <link rel="stylesheet" href="/static/nprogress.css"/>
+    </NextHead>
+  </div>
+);
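Pages opt into the progress bar by rendering this header once. A minimal usage sketch (the `pages/index.jsx` path and the surrounding markup are hypothetical, not part of this diff):

```jsx
// pages/index.jsx (hypothetical)
import React from 'react';
import Header from '../components/Header';

export default () => (
  <div>
    <Header/>
    {/* page content */}
  </div>
);
```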
diff --git a/components/Instantsearch.jsx b/components/Instantsearch.jsx
new file mode 100644
index 0000000..d403c52
--- /dev/null
+++ b/components/Instantsearch.jsx
@@ -0,0 +1,5 @@
+import { createInstantSearch } from 'react-instantsearch/server';
+
+const {InstantSearch, findResultsState} = createInstantSearch();
+
+export { InstantSearch, findResultsState };
diff --git a/components/PopMenu.jsx b/components/PopMenu.jsx
new file mode 100644
index 0000000..095c16e
--- /dev/null
+++ b/components/PopMenu.jsx
@@ -0,0 +1,162 @@
+import React from 'react';
+import Popup from 'reactjs-popup';
+import List from '@material-ui/core/List';
+import ListItem from '@material-ui/core/ListItem';
+import ListItemText from '@material-ui/core/ListItemText';
+import Draggable from 'react-draggable';
+
+/**
+ * Required Props:
+ * array: array of JSON objects, should have string children of: name, title, instructor etc
+ * filter: filtering function of your choice, should act on above mentioned array
+ */
+
+
+// style declarations
+const lStyle = {
+  overflow: 'auto',
+  maxHeight: '400px',
+  width: '300px',
+};
+
+const pStyle = {
+  fontSize: '14px',
+  textAlign: 'left',
+};
+
+const inStyle = {
+  width: '300px',
+  fontSize: '14px',
+};
+
+class CourseDetailsWindow extends React.Component {
+  render() {
+    const course = this.props.course;
+    return (
+      <div>
+        <a className="close" onClick={this.props.close}>×</a>
+        <h3>
+          {`${course.name} ${course.title}`}
+        </h3>
+        <div>
+          <p style={pStyle}>
+            {`Instructor: ${course.instructor}`}
+          </p>
+          <p style={pStyle}>
+            {`Terms: ${course.terms}`}
+          </p>
+          <p style={pStyle}>
+            {`GE: ${course.geCategories}`}
+          </p>
+          <p style={pStyle}>
+            {`Division: ${course.division}`}
+          </p>
+          <p style={pStyle}>
+            {`Description: ${course.description}`}
+          </p>
+        </div>
+      </div>
+    );
+  }
+}
+
+class PopMenu extends React.Component {
+  constructor(props) {
+    super(props);
+
+    this.handleClick = this.handleClick.bind(this);
+    this.handleOutsideClick = this.handleOutsideClick.bind(this);
+
+    this.state = {
+      visibleElements: 15,
+      popupVisible: false,
+    };
+  }
+
+  // Load 15 more results whenever the list is scrolled near its bottom.
+  onListScroll = (event) => {
+    const el = document.getElementById('listDiv');
+    const max = el.scrollHeight;
+    const scrolled = el.scrollTop;
+    let newVisibleElements = this.state.visibleElements + 15;
+
+    if ((max - scrolled) < 410) {
+      this.setState({
+        visibleElements: newVisibleElements,
+      });
+    }
+  };
+
+  handleFilterCall = (event) => {
+    this.props.filter(event);
+    this.setState({visibleElements: 15});
+  };
+
+  handleClick() {
+    if (!this.state.popupVisible) {
+      document.addEventListener('click', this.handleOutsideClick, false);
+    } else {
+      document.removeEventListener('click', this.handleOutsideClick, false);
+    }
+
+    this.setState(prevState => ({
+      popupVisible: !prevState.popupVisible,
+    }));
+  }
+
+  handleOutsideClick(e) {
+    if (this.node.contains(e.target)) {
+      return;
+    }
+    this.handleClick();
+  }
+
+  render() {
+    const data = this.props.array.slice(0, this.state.visibleElements);
+    let n = 0;
+
+    return (
+      <div>
+        <Popup
+          trigger={
+            // trigger markup reconstructed: an input feeding the
+            // parent-supplied filter via handleFilterCall
+            <input
+              style={inStyle}
+              placeholder="Search courses..."
+              onChange={this.handleFilterCall}/>
+          }
+          position="bottom left"
+          onOpen={this.handleClick}
+          //on="click"
+          //closeOnDocumentClick
+          mouseLeaveDelay={300}
+          mouseEnterDelay={0}
+          contentStyle={{padding: '0px', border: 'none'}}
+          arrow={false}
+        >
+          {this.state.popupVisible && (
+            <div ref={(node) => {
+              this.node = node;
+            }} style={lStyle} id="listDiv" onScroll={this.onListScroll}>
+              <List>
+                {data.map((course) => (
+                  <Popup
+                    key={n++}
+                    trigger={
+                      <ListItem button>
+                        <ListItemText
+                          primary={course.name}
+                          secondary={course.title}/>
+                      </ListItem>
+                    } modal>
+                    {close =>
+                      <Draggable>
+                        <CourseDetailsWindow course={course} close={close}/>
+                      </Draggable>
+                    }
+                  </Popup>
+                ))}
+              </List>
+            </div>
+          )}
+        </Popup>
+      </div>
+    );
+  }
+}
+
+export default PopMenu;
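PopMenu deliberately leaves the filtering logic to its parent: `array` is the (already filtered) course list, and `filter` receives the search input's change events. A usage sketch under those assumptions (the data and handler below are hypothetical):

```jsx
import React from 'react';
import PopMenu from './PopMenu';

// Hypothetical course data; each entry needs the string fields that
// CourseDetailsWindow displays (name, title, instructor, terms, ...).
const courses = [
  {name: 'CMPS 101', title: 'Algorithms and Abstract Data Types',
    instructor: 'Van Gelder', terms: 'F', geCategories: 'MF',
    division: 'upper-division', description: '...'},
];

export default () => (
  <PopMenu
    array={courses}
    filter={(event) => console.log('filter on:', event.target.value)}/>
);
```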
diff --git a/components/Popups.jsx b/components/Popups.jsx
new file mode 100644
index 0000000..39937f5
--- /dev/null
+++ b/components/Popups.jsx
@@ -0,0 +1,89 @@
+import React from 'react';
+import Popup from 'reactjs-popup';
+
+class CourseDetailsPanel extends React.Component {
+  render() {
+    const course = this.props.course;
+    return (
+      <div>
+        <p>{'Instructor: '}{course.instructor}</p>
+        <p>{'Time: '}{course.time}</p>
+        <p>{'Location: '}{course.location}</p>
+      </div>
+    );
+  }
+}
+
+class CourseDetailsWindow extends React.Component {
+  render() {
+    const course = this.props.course;
+    return (
+      <div>
+        <a className="close" onClick={this.props.close}>×</a>
+        <h3>
+          {course.course_title} {course.course_number}
+        </h3>
+        <CourseDetailsPanel course={course}/>
+      </div>
+    );
+  }
+}
+
+class Popups extends React.Component {
+  constructor(props) {
+    super(props);
+    this.removeDefTags = this.removeDefTags.bind(this);
+    this.selectDefTag = this.selectDefTag.bind(this);
+    this.state = {
+      deftags: props.tags || [],
+    };
+  }
+
+  static defaultProps = {
+    myLists: ({
+      course_title: 'Software Engineering',
+      course_number: '123',
+      instructor: 'Richard',
+      time: 'MW 9:00-12:30AM',
+      location: 'LEC Annex 101',
+    }),
+  };
+
+  // Both helpers go through setState; assigning to this.state directly
+  // does not trigger a re-render.
+  removeDefTags() {
+    if (this.state.deftags === 'N/A'
+      || this.state.deftags === 'In Progress'
+      || this.state.deftags === 'Finished') {
+      this.setState({deftags: ''});
+    }
+  }
+
+  selectDefTag(e) {
+    if (e === 'N/A' || e === 'In Progress' || e === 'Finished') {
+      this.setState({deftags: e});
+    } else {
+      this.removeDefTags();
+    }
+  }
+
+  render() {
+    return <div>
+      <Popup
+        trigger={<button>{this.props.myLists.course_title}</button>}
+        modal>
+        {close =>
+          <CourseDetailsWindow
+            course={this.props.myLists}
+            close={close}
+            selectTag={(tag) => this.selectDefTag(tag)}/>
+        }
+      </Popup>
+    </div>;
+  }
+}
+
+export default Popups;
\ No newline at end of file
diff --git a/components/Search.jsx b/components/Search.jsx
new file mode 100644
index 0000000..097738e
--- /dev/null
+++ b/components/Search.jsx
@@ -0,0 +1,82 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+import { Configure, Highlight, Hits, SearchBox } from 'react-instantsearch/dom';
+
+import { InstantSearch } from './Instantsearch';
+
+/**
+ * The display component of the result of a search.
+ * @param hit {object} The hit object contains properties of a Course.
+ * @param hit.name {String} Department + Number of the Course. E.g. "CMPS 101"
+ * @param hit.title {String} Title of the Course.
+ *   E.g. "Algorithms and Abstract Data Types"
+ * @param hit.terms {String} Term of year offered. E.g. "F" for fall.
+ * @param hit.instructor {String} The name of the Instructor. E.g. "Van Gelder"
+ * @param hit.geCategories {String} The General Education requirement. E.g. "CC"
+ * @param hit.division {String} Indicates "upper-division", "lower-division", or
+ *   "graduate".
+ * @return {Element}
+ * @constructor
+ */
+const HitComponent = ({hit}) => (
+  <div className="hit">
+    {/* the exact set of highlighted attributes is a reconstruction */}
+    <div className="hit-name">
+      <Highlight attribute="name" hit={hit}/>
+    </div>
+    <div className="hit-title">
+      <Highlight attribute="title" hit={hit}/>
+    </div>
+    <div className="hit-instructor">
+      <Highlight attribute="instructor" hit={hit}/>
+    </div>
+    <div className="hit-division">
+      <Highlight attribute="division" hit={hit}/>
+    </div>
+  </div>
+);
+
+/**
+ * @type {{hit: shim}}
+ */
+HitComponent.propTypes = {
+  hit: PropTypes.object,
+};
+
+/**
+ * @inheritDoc
+ */
+class Search extends React.Component {
+
+  /**
+   * @type {{searchState: shim, resultsState, onSearchStateChange: shim}}
+   */
+  static propTypes = {
+    searchState: PropTypes.object,
+    resultsState: PropTypes.oneOfType([PropTypes.object, PropTypes.array]),
+    onSearchStateChange: PropTypes.func,
+  };
+
+  /**
+   * @return {Element}
+   */
+  render() {
+    return (
+      <InstantSearch
+        // placeholder Algolia credentials; the real appId / apiKey /
+        // indexName were not recoverable from this diff
+        appId="APP_ID"
+        apiKey="SEARCH_ONLY_API_KEY"
+        indexName="courses"
+        searchState={this.props.searchState}
+        resultsState={this.props.resultsState}
+        onSearchStateChange={this.props.onSearchStateChange}>
+        <Configure hitsPerPage={12}/>
+        <header>
+          <h1>UCSC Courses</h1>
+          <SearchBox/>
+        </header>
+        <content>
+          <Hits hitComponent={HitComponent}/>
+        </content>
+      </InstantSearch>
+    );
+  }
+}
+
+export default Search;
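Search is built for server-side rendering: `findResultsState` from `components/Instantsearch.jsx` can pre-compute `resultsState` before the page is sent to the client. A sketch of how a page would wire that up, assuming react-instantsearch v5's server API (the page path is hypothetical):

```jsx
// pages/search.jsx (hypothetical)
import React from 'react';
import Search from '../components/Search';
import { findResultsState } from '../components/Instantsearch';

export default class SearchPage extends React.Component {
  static async getInitialProps() {
    // Render Search once on the server to collect its Algolia results.
    const resultsState = await findResultsState(Search);
    return {resultsState};
  }

  render() {
    return <Search resultsState={this.props.resultsState}/>;
  }
}
```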
diff --git a/components/Tooltip.jsx b/components/Tooltip.jsx
new file mode 100644
index 0000000..9047990
--- /dev/null
+++ b/components/Tooltip.jsx
@@ -0,0 +1,86 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+
+// The modal "window"
+const modalStyle = {
+  backgroundColor: '#fff',
+  borderRadius: 5,
+  maxWidth: 500,
+  minHeight: 300,
+  margin: '0 auto',
+  padding: 30,
+};
+
+class Tooltip extends React.Component {
+  static propTypes = {
+    content: PropTypes.node,
+    trigger: PropTypes.node,
+  };
+
+  static defaultProps = {
+    content: '',
+    trigger: '',
+  };
+
+  constructor(props) {
+    super(props);
+    this.state = {
+      isOpen: false,
+    };
+  }
+
+  handleMouseOver = () => {
+    this.setState({isOpen: true});
+  };
+
+  handleMouseOut = () => {
+    this.setState({isOpen: false});
+  };
+
+  handleClick = () => {
+    if (!this.state.isOpen) {
+      document.addEventListener('click', this.handleOutsideClick, false);
+    } else {
+      document.removeEventListener('click', this.handleOutsideClick, false);
+    }
+    this.setState({
+      isOpen: !this.state.isOpen,
+    });
+  };
+
+  handleOutsideClick = (e) => {
+    if (this.node.contains(e.target)) {
+      return;
+    }
+    this.handleClick();
+  };
+
+
+  render() {
+    return (
+      <div ref={(node) => { this.node = node; }}>
+        <span
+          onMouseOver={this.handleMouseOver}
+          onMouseOut={this.handleMouseOut}
+          onClick={this.handleClick}>
+          {this.props.trigger}
+        </span>
+        {this.state.isOpen &&
+          <div style={modalStyle}>
+            {this.props.content}
+          </div>
+        }
+      </div>
+    );
+  }
+}
+
+export default Tooltip;
\ No newline at end of file
diff --git a/components/graph/CourseInfoCard.jsx b/components/graph/CourseInfoCard.jsx
new file mode 100644
index 0000000..9fc67e0
--- /dev/null
+++ b/components/graph/CourseInfoCard.jsx
@@ -0,0 +1,60 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+
+import { withStyles } from '@material-ui/core/styles';
+import Card from '@material-ui/core/Card';
+import CardContent from '@material-ui/core/CardContent';
+import CardHeader from '@material-ui/core/CardHeader';
+import Typography from '@material-ui/core/Typography';
+
+/**
+ * Define the style of components on this page
+ * @param theme
+ * @return {object}
+ */
+const styles = theme => ({
+  panel: {
+    'maxWidth': 350,
+    'maxHeight': 600,
+    // 'position': 'absolute',
+    'margin-left': 'auto',
+    'margin-right': 'auto',
+    'z-index': 100,
+    'top': theme.spacing.unit * 25,
+    'right': theme.spacing.unit * 10,
+  },
+});
+
+/**
+ * @param classes {object}
+ * @param label {string}
+ * @param title {string}
+ * @param description {string}
+ * @return {Element}
+ * @constructor
+ */
+const CourseInfoCard = ({classes, label, title, description}) => {
+  return (
+    <Card className={classes.panel}>
+      <CardHeader title={title} subheader={label}/>
+      <CardContent>
+        <Typography component="p">
+          {description || 'Unavailable'}
+        </Typography>
+      </CardContent>
+    </Card>
+  );
+};
+
+CourseInfoCard.propTypes = {
+  classes: PropTypes.object.isRequired,
+  label: PropTypes.string.isRequired,
+  title: PropTypes.string.isRequired,
+  description: PropTypes.string.isRequired,
+};
+
+CourseInfoCard.defaultProps = {
+  title: 'Untitled',
+  description: 'Unavailable',
+  label: '- --',
+};
+
+export default withStyles(styles)(CourseInfoCard);
diff --git a/components/graph/GraphView.jsx b/components/graph/GraphView.jsx
new file mode 100644
index 0000000..b191e79
--- /dev/null
+++ b/components/graph/GraphView.jsx
@@ -0,0 +1,149 @@
+import React, { Component } from 'react';
+import PropTypes from 'prop-types';
+import Graph from 'react-graph-vis';
+
+import { withStyles } from '@material-ui/core/styles';
+
+/**
+ * Define the style of components on this page
+ * @param theme
+ * @return {object}
+ */
+const styles = theme => ({
+  fullpage: {
+    position: 'absolute',
+    top: 0,
+    bottom: 0,
+    left: 0,
+    right: 0,
+    overflow: 'hidden',
+    'z-index': -1,
+  },
+});
+
+/**
+ * vis.js graph configuration settings
+ * @type {object}
+ */
+const options = {
+  groups: {
+    useDefaultGroups: true,
+    myGroupId: {
+      /* node options */
+    },
+  },
+  layout: {
+    randomSeed: 666,
+    hierarchical: {
+      enabled: false,
+      sortMethod: 'hubsize',
+    },
+    improvedLayout: true,
+  },
+  edges: {
+    color: '#000000',
+  },
+  width: '100%',
+  height: '100%',
+  autoResize: true,
+  nodes: {
+    shape: 'box',
+    color: '#89C4F4',
+    shapeProperties: {
+      borderRadius: 0, // only for box shape
+    },
+  },
+  physics: {
+    solver: 'forceAtlas2Based',
+    adaptiveTimestep: true,
+    stabilization: {
+      enabled: true,
+      iterations: 1,
+      updateInterval: 100,
+      onlyDynamicEdges: false,
+      fit: true,
+    },
+    repulsion: {
+      nodeDistance: 250,
+    },
+  },
+  interaction: {
+    hover: true,
+    hoverConnectedEdges: false,
+  },
+};
+
+/**
+ * @param dept {string} the Department name
+ * @return {object}
+ */
+const generateGroupObject = (dept) => {
+  // TODO: generate a distinct (random) color per department
+  return {
+    color: {background: 'red'},
+  };
+};
+
+/**
+ * Takes the raw data received from the server and parses it, adding
+ * additional properties to the nodes.
+ * @param rawData + * @return {{data: object, departments: Set}} + */ +function parseGraphData(rawData) { + let graph = Object.assign({}, rawData); + let depts = new Set(); + + graph.nodes.forEach((node) => { + node.group = node.dept; + depts.add(node.dept); + }); + + return {graph, depts}; +} + +/** + * Wrapper to the Vis.js Graph. Handle additional Events. + */ +class GraphView extends Component { + static propTypes = { + classes: PropTypes.object.isRequired, + data: PropTypes.object.isRequired, + events: PropTypes.shape({ + select: PropTypes.func.isRequired, + hoverNode: PropTypes.func.isRequired, + blurNode: PropTypes.func.isRequired, + }).isRequired, + }; + + state = { + toolOpen: false, + toolNode: '', + popOpen: false, + popNode: '', + }; + + render() { + const {classes, data, events} = this.props; + + // Modify the graphData before passing to child component. + const {graph, depts} = parseGraphData(data); + + // Must inject the groups data into the options. + for (const department of depts) { + options.groups.myGroupId[department] = generateGroupObject(department); + } + + return ( +
+      <div className={classes.fullpage}>
+        <Graph graph={graph} options={options} events={events}/>
+      </div>
+    );
+  }
+}
+
+export default withStyles(styles)(GraphView);
diff --git a/components/graph/GraphViewAssembly.jsx b/components/graph/GraphViewAssembly.jsx
new file mode 100644
index 0000000..861c73b
--- /dev/null
+++ b/components/graph/GraphViewAssembly.jsx
@@ -0,0 +1,154 @@
+import React, { Component } from 'react';
+import PropTypes from 'prop-types';
+
+import filteredGraph from '../utils/filterAlgortithm';
+import GraphView from './GraphView';
+import SearchbarDrawer from '../searchbar/SearchbarDrawer';
+import CourseInfoCard from './CourseInfoCard';
+
+/**
+ * GraphViewAssembly renders both drawers and the graph view. Takes a prop
+ * `data`, which is the graph data of a school.
+ */
+class GraphViewAssembly extends Component {
+  static propTypes = {
+    data: PropTypes.object.isRequired,
+  };
+
+  state = {
+    graphData: {'nodes': [], 'edges': []},
+    selectedIDs: [],
+    selectedNode: null, // the currently selected node object on the graph
+  };
+
+  /**
+   * Rebuild the filtered graph from the current selection.
+   * @param selection {Array} ids of the selected nodes
+   */
+  updateSelected(selection) {
+    const {data} = this.props;
+
+    let graph = filteredGraph(data.nodes, selection);
+
+    // Update colors
+    // selection.forEach((selId) => {
+    //   let needNewColorIndex = graph.nodes.findIndex((i) => i.id === selId);
+    //   graph.nodes[needNewColorIndex].color = '#e04141';
+    // });
+
+    this.setState({
+      graphData: graph,
+      selectedIDs: selection,
+    });
+  }
+
+  /**
+   * @param nodeId {(string|number)} id of the node to select
+   */
+  selectNode(nodeId) {
+    const {selectedIDs} = this.state;
+
+    // Skip if id is already selected
+    if (selectedIDs.includes(nodeId)) {
+      return;
+    }
+
+    // Add to selection + update graph
+    let selected = selectedIDs.slice();
+    selected.push(nodeId);
+    this.updateSelected(selected);
+  }
+
+  /**
+   * @param nodeId {(string|number)} id of the node to deselect
+   */
+  deselectNode(nodeId) {
+    const {selectedIDs} = this.state;
+
+    const index = selectedIDs.findIndex((element) => element === nodeId);
+    let selected = selectedIDs.slice();
+    selected.splice(index, 1);
+    this.updateSelected(selected);
+  }
+
+  /**
+   * Select a node when a search-result item is clicked.
+   * @param event
+   * @param nodeId
+   */
+  handleItemClick(event, nodeId) {
+    this.selectNode(nodeId);
+  }
+
+  /**
+   * Deselect a node when its chip in the selected list is clicked.
+   * @param event
+   * @param nodeId
+   */
+  handleSelectedClick(event, nodeId) {
+    this.deselectNode(nodeId);
+  }
+
+  /**
+   * Passed to child component
+   * @type {
+   *   {select: GraphView.events.select,
+   *   hoverNode: GraphView.events.hoverNode,
+   *   blurNode: GraphView.events.blurNode}
+   * }
+   */
+  events = {
+    select: (event) => {
+      let {nodes} = event;
+
+      this.setState({
+        selectedNode: nodes.length > 0 ? this.getNode(nodes[0]) : null,
+      });
+    },
+    hoverNode: (event) => {
+      // hover behavior not implemented yet
+    },
+    blurNode: (event) => {
+      // blur behavior not implemented yet
+    },
+  };
+
+  /**
+   * @param id
+   * @returns {object}
+   */
+  getNode(id) {
+    let arr = this.props.data.nodes;
+    let result = null;
+
+    arr.forEach((node) => {
+      if (node.id == id) { // '==' on purpose: ids may be string or number
+        result = node;
+      }
+    });
+
+    return result;
+  }
+
+  render() {
+    const {selectedNode} = this.state;
+
+    return (
+      <div>
+        <SearchbarDrawer
+          courses={this.props.data.nodes}
+          itemClick={(event, id) => this.handleItemClick(event, id)}
+          selClick={(event, sel) => this.handleSelectedClick(event, sel)}
+          selected={this.state.selectedIDs}
+        />
+        <GraphView
+          data={this.state.graphData}
+          events={this.events}
+        />
+        {selectedNode &&
+          <CourseInfoCard
+            label={selectedNode.label}
+            title={selectedNode.title}
+            description={selectedNode.description}/>}
+      </div>
+    );
+  }
+}
+
+export default GraphViewAssembly;
diff --git a/components/graph/GraphViewLoader.jsx b/components/graph/GraphViewLoader.jsx
new file mode 100644
index 0000000..f76cf78
--- /dev/null
+++ b/components/graph/GraphViewLoader.jsx
@@ -0,0 +1,83 @@
+import React from 'react';
+import GraphViewAssembly from './GraphViewAssembly';
+
+class LoadingMessage extends React.Component {
+  render() {
+    return (
+      <div>
+        <h3>
+          Loading...
+        </h3>
+        {this.props.url}
+      </div>
+    );
+  }
+}
+
+class LoadErrorMessage extends React.Component {
+  render() {
+    return (
+      <div>
+        <h3>
+          Failed to load data from:
+        </h3>
+        {this.props.url}
+        <p>
+          {'' + this.props.error}
+        </p>
+      </div>
+    );
+  }
+}
+
+export default class GraphViewLoader extends React.Component {
+  constructor(props) {
+    super(props);
+    this.state = {
+      data: null,
+      dataLoadError: null,
+    };
+  }
+
+  setData(data) {
+    this.setState({
+      data: {
+        nodes: data.nodes,
+        edges: data.edges,
+      },
+    });
+  }
+
+  setDataError(error) {
+    this.setState({
+      data: null,
+      dataLoadError: error,
+    });
+  }
+
+  componentDidMount() {
+    console.log('Fetching data...');
+    fetch(this.props.jsonDataUrl)
+      .then((response) => {
+        if (!response.ok) {
+          throw Error(response.statusText);
+        }
+        console.log('Got response');
+        return response.json();
+      })
+      .then((data) => {
+        console.log('Got json data');
+        this.setData(data);
+      })
+      .catch((error) => {
+        console.log('Got error: ' + error);
+        this.setDataError(error);
+      });
+  }
+
+  render() {
+    if (this.state.dataLoadError) {
+      return <LoadErrorMessage
+        url={this.props.jsonDataUrl}
+        error={this.state.dataLoadError}/>;
+    }
+    if (!this.state.data) {
+      return <LoadingMessage url={this.props.jsonDataUrl}/>;
+    }
+    return <GraphViewAssembly data={this.state.data}/>;
+  }
+}
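A page can mount the entire graph UI by pointing the loader at a JSON endpoint that returns `{nodes: [...], edges: [...]}`; the URL below is a hypothetical example:

```jsx
import React from 'react';
import GraphViewLoader from '../components/graph/GraphViewLoader';

// Hypothetical data URL; any endpoint serving {nodes, edges} JSON works.
export default () => (
  <GraphViewLoader jsonDataUrl="/static/graph_data.json"/>
);
```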
diff --git a/components/home/HomePanel.jsx b/components/home/HomePanel.jsx
new file mode 100644
index 0000000..84da8f5
--- /dev/null
+++ b/components/home/HomePanel.jsx
@@ -0,0 +1,83 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+import Router from 'next/router';
+
+import { withStyles } from '@material-ui/core/styles';
+import Button from '@material-ui/core/Button';
+import Card from '@material-ui/core/Card';
+import CardActions from '@material-ui/core/CardActions';
+import CardContent from '@material-ui/core/CardContent';
+import Typography from '@material-ui/core/Typography';
+
+/**
+ * Define the style of components on this page
+ * @param theme
+ * @return {object}
+ */
+const styles = theme => ({
+  panel: {
+    'maxWidth': 350,
+    'position': 'absolute',
+    'margin-left': 'auto',
+    'margin-right': 'auto',
+    'left': 0,
+    'right': 0,
+    'z-index': 100,
+    'top': '50%',
+    'transform': `translateY(-${50}%)`,
+  },
+  button: {
+    'margin': 'auto',
+  },
+});
+
+/**
+ * @param href {string}
+ * @return {Function}
+ */
+const onClickHandler = (href) => (e) => {
+  e.preventDefault();
+  Router.push(href);
+};
+
+/**
+ * @param classes
+ * @return {Element}
+ * @constructor
+ */
+const HomePanel = ({classes}) => {
+  return (
+    <Card className={classes.panel}>
+      <CardContent>
+        <Typography variant="headline" component="h2">
+          Course Graph
+        </Typography>
+        <Typography color="textSecondary">
+          v2.0.0
+        </Typography>
+        <Typography component="p">
+          🏅 A dynamic, browser based visualization course planner.
+          Designed to help students with course planning.
+        </Typography>
+      </CardContent>
+      <CardActions>
+        {/* the button's target route was not recoverable; '/graph' is a guess */}
+        <Button
+          className={classes.button}
+          size="small"
+          onClick={onClickHandler('/graph')}>
+          Get Started
+        </Button>
+      </CardActions>
+    </Card>
+  );
+};
+
+HomePanel.propTypes = {
+  classes: PropTypes.object,
+};
+
+export default withStyles(styles)(HomePanel);
diff --git a/components/home/HomePanel.test.jsx b/components/home/HomePanel.test.jsx
new file mode 100644
index 0000000..034bce2
--- /dev/null
+++ b/components/home/HomePanel.test.jsx
@@ -0,0 +1,11 @@
+import React from 'react';
+import { shallow } from 'enzyme';
+
+import HomePanel from './HomePanel';
+
+describe('A HomePanel', () => {
+  it('should render default header without throwing an error', () => {
+    const wrapper = shallow(<HomePanel/>);
+    expect(wrapper.text()).toEqual('');
+  });
+});
diff --git a/components/login/Login.jsx b/components/login/Login.jsx
new file mode 100644
index 0000000..e5dfc6c
--- /dev/null
+++ b/components/login/Login.jsx
@@ -0,0 +1,100 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+import { withStyles } from '@material-ui/core/styles';
+
+import TextField from '@material-ui/core/TextField';
+import Button from '@material-ui/core/Button';
+
+import fetch from 'isomorphic-unfetch';
+
+const styles = theme => ({
+  container: {
+    display: 'flex',
+    flexWrap: 'wrap',
+  },
+  card: {
+    minWidth: 275,
+  },
+  button: {
+    margin: theme.spacing.unit * 3,
+  },
+});
+
+/**
+ * Login Component that provides text fields and a submit button.
+ * @inheritDoc
+ */
+class Login extends React.Component {
+  static propTypes = {
+    // classes: PropTypes.object.isRequired,
+  };
+
+  state = {
+    email: '',
+    password: '',
+  };
+
+  // Helpers: update a single named field from its input's change event.
+  handleChange = (name) => (event) => {
+    this.setState({
+      [name]: event.target.value,
+    });
+  };
+
+  handleSubmit = async (event) => {
+    event.preventDefault();
+
+    let data = {
+      email: this.state.email,
+      password: this.state.password,
+    };
+
+    // https://coursegraph.org/account/login
+    await fetch('http://localhost:8080/account/login', {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+      },
+      body: JSON.stringify(data),
+    });
+  };
+
+  /**
+   * @return {Element}
+   */
+  render() {
+    const {classes} = this.props;
+
+    return (
+      <form className={classes.container} onSubmit={this.handleSubmit}>
+        <TextField
+          id="email"
+          label="Email"
+          type="email"
+          margin="normal"
+          value={this.state.email}
+          onChange={this.handleChange('email')}/>
+        <TextField
+          id="password"
+          label="Password"
+          type="password"
+          autoComplete="current-password"
+          margin="normal"
+          value={this.state.password}
+          onChange={this.handleChange('password')}/>
+        <Button
+          className={classes.button}
+          type="submit"
+          variant="raised"
+          color="primary">
+          Login
+        </Button>
+      </form>
+    );
+  }
+}
+
+export default withStyles(styles)(Login);
diff --git a/components/searchbar/FloatingActionButton.jsx b/components/searchbar/FloatingActionButton.jsx
new file mode 100644
index 0000000..bf5f45f
--- /dev/null
+++ b/components/searchbar/FloatingActionButton.jsx
@@ -0,0 +1,56 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+import { withStyles } from '@material-ui/core/styles';
+import Button from '@material-ui/core/Button';
+import SearchIcon from '@material-ui/icons/Search';
+import Draggable from 'react-draggable';
+
+/**
+ * Material UI theme styles. This Button can float above other components.
+ * @param theme
+ * @return {{
+ *   extendedIcon: {marginRight: number},
+ *   absolute: {position: string, top: number, left: number}
+ * }}
+ */
+const styles = theme => ({
+  extendedIcon: {
+    'marginRight': theme.spacing.unit,
+  },
+  absolute: {
+    'position': 'absolute',
+    'top': theme.spacing.unit * 4,
+    'left': theme.spacing.unit * 6,
+  },
+});
+
+/**
+ * Floating, draggable search button.
+ */
+class FloatingActionButtons extends React.Component {
+  static propTypes = {
+    classes: PropTypes.object.isRequired,
+    buttonClick: PropTypes.func.isRequired,
+  };
+
+  render() {
+    const {classes} = this.props;
+
+    return (
+      <Draggable>
+        <Button
+          variant="fab"
+          color="primary"
+          aria-label="search"
+          className={classes.absolute}
+          onClick={this.props.buttonClick}>
+          <SearchIcon className={classes.extendedIcon}/>
+        </Button>
+      </Draggable>
+    );
+  }
+}
+
+export default withStyles(styles)(FloatingActionButtons);
diff --git a/components/searchbar/SearchBar.jsx b/components/searchbar/SearchBar.jsx
new file mode 100644
index 0000000..f3abafb
--- /dev/null
+++ b/components/searchbar/SearchBar.jsx
@@ -0,0 +1,50 @@
+import React, { Component } from 'react';
+import PropTypes from 'prop-types';
+import { withStyles } from '@material-ui/core/styles';
+
+import TextField from '@material-ui/core/TextField';
+
+/**
+ * Define the style of components on this page
+ * @param theme
+ * @return {object}
+ */
+const styles = theme => ({
+  textField: {
+    marginLeft: theme.spacing.unit,
+    marginRight: theme.spacing.unit,
+    width: '90%',
+  },
+});
+
+/**
+ * Simple Input interface that prompts input from user.
+ */
+class SearchBar extends Component {
+  static propTypes = {
+    classes: PropTypes.object.isRequired,
+    onChange: PropTypes.func.isRequired,
+  };
+
+  state = {
+    value: '',
+  };
+
+  onChange = (event) => {
+    this.setState({value: event.target.value});
+    this.props.onChange(event.target.value);
+  };
+
+  render() {
+    const {classes} = this.props;
+
+    return (
+      <TextField
+        id="search"
+        label="Search Courses"
+        type="search"
+        className={classes.textField}
+        margin="normal"
+        value={this.state.value}
+        onChange={this.onChange}/>
+    );
+  }
+}
+
+export default withStyles(styles)(SearchBar);
diff --git a/components/searchbar/SearchResultList.jsx b/components/searchbar/SearchResultList.jsx
new file mode 100644
index 0000000..27cf08f
--- /dev/null
+++ b/components/searchbar/SearchResultList.jsx
@@ -0,0 +1,71 @@
+import React, { Component } from 'react';
+import PropTypes from 'prop-types';
+import { withStyles } from '@material-ui/core/styles';
+
+import List from '@material-ui/core/List';
+import ListItem from '@material-ui/core/ListItem';
+import ListItemText from '@material-ui/core/ListItemText';
+
+/**
+ * Define the style of components on this page
+ * @param theme
+ * @return {object}
+ */
+const styles = theme => ({
+  listdiv: {
+    overflow: 'auto',
+    maxHeight: '400px',
+    maxWidth: '300px',
+  },
+});
+
+/**
+ * List container that displays the search results
+ */
+class SearchResultList extends Component {
+  static propTypes = {
+    classes: PropTypes.object.isRequired,
+    courses: PropTypes.array.isRequired,
+    itemClick: PropTypes.func.isRequired,
+  };
+
+  state = {
+    visibleElements: 40,
+  };
+
+  // This handler lives here because it needs the scrollHeight and scrollTop
+  // of the list's div; those aren't available any higher up the tree.
+  onListScroll = (event) => {
+    // note: looks the list's div up by id; a ref would be cleaner
+    const el = document.getElementById('listDiv');
+
+    if ((el.scrollHeight - el.scrollTop) < 810) {
+      let newVisibleElements = this.state.visibleElements + 40;
+      this.setState({
+        visibleElements: newVisibleElements,
+      });
+    }
+  };
+
+  render() {
+    const {classes} = this.props;
+
+    let courses = this.props.courses.slice(0, this.state.visibleElements);
+
+    return (
+      <div className={classes.listdiv} id="listDiv" onScroll={this.onListScroll}>
+        <List>
+          {courses.map((course) => (
+            <ListItem
+              button
+              key={course.id}
+              onClick={(event) => this.props.itemClick(event, course.id)}>
+              <ListItemText
+                primary={course.label}
+                secondary={course.title}/>
+            </ListItem>
+          ))}
+        </List>
+      </div>
+    );
+  }
+}
+
+export default withStyles(styles)(SearchResultList);
diff --git a/components/searchbar/SearchbarAssembly.jsx b/components/searchbar/SearchbarAssembly.jsx
new file mode 100644
index 0000000..5234a22
--- /dev/null
+++ b/components/searchbar/SearchbarAssembly.jsx
@@ -0,0 +1,65 @@
+import React, { Component } from 'react';
+import PropTypes from 'prop-types';
+
+import { withStyles } from '@material-ui/core/styles';
+import Divider from '@material-ui/core/Divider';
+
+import { levenshtein, match } from '../utils/levenshtien';
+import SearchResultList from './SearchResultList';
+import SearchBar from './SearchBar';
+
+const styles = theme => ({
+  container: {
+    'max-width': 300,
+  },
+});
+
+
+class SearchbarAssembly extends Component {
+  static propTypes = {
+    classes: PropTypes.object.isRequired,
+    courses: PropTypes.array.isRequired,
+    itemClick: PropTypes.func.isRequired,
+  };
+
+  state = {
+    searchQuery: '',
+  };
+
+  // scratch rows reused by the levenshtein calls below
+  tempArrayA = [];
+  tempArrayB = [];
+
+  updateSearch(search) {
+    this.setState({
+      searchQuery: search,
+    });
+  }
+
+  render() {
+    const {classes, courses} = this.props;
+
+    const searchQuery = this.state.searchQuery;
+    let filteredData = courses.filter(match(searchQuery));
+
+    filteredData.forEach((course) => {
+      course.priority = levenshtein(
+        searchQuery.toLowerCase(),
+        course.searchString.toLowerCase(),
+        this.tempArrayA,
+        this.tempArrayB
+      );
+    });
+
+    filteredData.sort((a, b) => a.priority - b.priority);
+
+    return (
+      <div className={classes.container}>
+        <SearchBar onChange={(search) => this.updateSearch(search)}/>
+        <Divider/>
+        <SearchResultList
+          courses={filteredData}
+          itemClick={(event, id) => this.props.itemClick(event, id)}/>
+      </div>
+    );
+  }
+}
+
+export default withStyles(styles)(SearchbarAssembly);
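The search pipeline above combines `match` (a cheap, right-anchored subsequence test that also caches `course.searchString`) with `levenshtein` (an edit-distance score used as sort priority); `tempArrayA`/`tempArrayB` are scratch rows reused across calls. The same flow in isolation, with hypothetical data:

```js
import { levenshtein, match } from '../utils/levenshtien';

const courses = [
  {label: 'CMPS 101', title: 'Algorithms', descr: ''},
  {label: 'CMPS 12B', title: 'Data Structures', descr: ''},
];

const query = 'cmps alg';
const rowA = [];
const rowB = [];

const ranked = courses
  .filter(match(query)) // keeps courses whose searchString contains the query as a subsequence
  .sort((a, b) =>
    levenshtein(query, a.searchString, rowA, rowB) -
    levenshtein(query, b.searchString, rowA, rowB));

console.log(ranked.map((course) => course.label)); // ['CMPS 101']
```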
diff --git a/components/searchbar/SearchbarDrawer.jsx b/components/searchbar/SearchbarDrawer.jsx
new file mode 100644
index 0000000..e42dd52
--- /dev/null
+++ b/components/searchbar/SearchbarDrawer.jsx
@@ -0,0 +1,55 @@
+import React, { Component } from 'react';
+import PropTypes from 'prop-types';
+
+import Drawer from '@material-ui/core/Drawer';
+import Divider from '@material-ui/core/Divider';
+
+import SearchbarAssembly from '../searchbar/SearchbarAssembly';
+import SelectedList from '../searchbar/SelectedList';
+import FloatingActionButton from '../searchbar/FloatingActionButton';
+
+/**
+ * Component with a button that triggers the drawer to open.
+ */
+class SearchbarDrawer extends Component {
+  static propTypes = {
+    classes: PropTypes.object,
+    courses: PropTypes.array.isRequired,
+    selected: PropTypes.array.isRequired,
+    itemClick: PropTypes.func.isRequired,
+    selClick: PropTypes.func.isRequired,
+  };
+
+  state = {
+    isOpen: false,
+  };
+
+  toggleDrawer = (open) => () => {
+    this.setState({
+      isOpen: open,
+    });
+  };
+
+  render() {
+    const {courses, selected, selClick, itemClick} = this.props;
+
+    return (
+      <div>
+        <FloatingActionButton buttonClick={this.toggleDrawer(true)}/>
+        <Drawer open={this.state.isOpen} onClose={this.toggleDrawer(false)}>
+          <SearchbarAssembly
+            courses={courses}
+            itemClick={(event, id) => itemClick(event, id)}/>
+          <Divider/>
+          <SelectedList
+            courses={courses}
+            selected={selected}
+            selClick={(event, sel) => selClick(event, sel)}/>
+        </Drawer>
+      </div>
+    );
+  }
+}
+
+export default SearchbarDrawer;
diff --git a/components/searchbar/SelectedList.jsx b/components/searchbar/SelectedList.jsx
new file mode 100644
index 0000000..3461d84
--- /dev/null
+++ b/components/searchbar/SelectedList.jsx
@@ -0,0 +1,68 @@
+import React, { Component } from 'react';
+import PropTypes from 'prop-types';
+
+import Paper from '@material-ui/core/Paper';
+import Chip from '@material-ui/core/Chip';
+import { withStyles } from '@material-ui/core/styles/index';
+
+/**
+ * Define the style of components on this page
+ * @param theme
+ * @return {object}
+ */
+const styles = theme => ({
+  select: {
+    maxWidth: '300px',
+    overflow: 'auto',
+    maxHeight: '300px',
+  },
+  text: {
+    fontSize: 10,
+  },
+});
+
+/**
+ * Generate a list of Chip elements
+ * @param classes
+ * @param courses
+ * @param selClick
+ * @param selected
+ * @return {Element}
+ */
+const createChips = (classes, courses, selClick, selected) =>
+  // `selected` holds node ids; index into `courses` for display labels
+  selected.map((course) => (
+    <Chip
+      key={course}
+      className={classes.text}
+      label={courses[course] && courses[course].label}
+      onDelete={(event) => selClick(event, course)}/>
+  ));
+
+/**
+ * A Chip container where the user can select and deselect nodes on the graph
+ */
+class SelectedList extends Component {
+  static propTypes = {
+    classes: PropTypes.object,
+    courses: PropTypes.array.isRequired,
+    selClick: PropTypes.func.isRequired,
+    selected: PropTypes.array.isRequired,
+  };
+
+  render() {
+    const {classes, courses, selClick, selected} = this.props;
+
+    return (
+      <Paper className={classes.select}>
+        {createChips(classes, courses, selClick, selected)}
+      </Paper>
+ ); + } +} + +export default withStyles(styles)(SelectedList); diff --git a/components/utils/GraphSelection.js b/components/utils/GraphSelection.js new file mode 100644 index 0000000..e54b0f2 --- /dev/null +++ b/components/utils/GraphSelection.js @@ -0,0 +1,257 @@ +//Graph Selection class + +class GraphSelection { + constructor(nodes, ids) { + this.nodeMap = new Map(); + this.edgeMap = new Map(); + //Leaves them as empty maps if not args provided + if (typeof (nodes) === 'object' && typeof (ids) !== 'undefined') { + buildGraph(nodes, ids, this.nodeMap, this.edgeMap); + } + } + + addNodes(nodes, ids) { + buildGraph(nodes, ids, this.nodeMap, this.edgeMap); + } + + removeNodes(nodes, ids) { + cleanGraph(nodes, ids, this.nodeMap, this.edgeMap); + } + + clear() { + this.nodeMap.clear(); + this.edgeMap.clear(); + } + + getGraphData() { + const graphEdges = this.edgeMap.values(); + const graphNodes = this.nodeMap.values(); + const graphData = { + edges: Array.from(graphEdges), + nodes: Array.from(graphNodes), + }; + return graphData; + } + + + +} + + + +function addFromEdges(nodes, id, nodeMap, edgeMap) { + const edgesFrom = nodes[id].edges_from; + + edgesFrom.forEach((fromID, index) => { + //Give every edge a unique key + edgeMap.set( + `${fromID}_${id}`, + {'from': fromID, 'to': id} + ); + if (!nodeMap.has(fromID)) { + nodeMap.set(fromID, nodes[fromID]); + addFromEdges(nodes, fromID, nodeMap, edgeMap); + } + }); +} + +function buildGraph(nodes, ids, nodeMap, edgeMap) { + + if (typeof (ids) === 'number') { + if (!nodeMap.has(ids)) { + nodeMap.set(ids, nodes[ids]); + addFromEdges(nodes, ids, nodeMap, edgeMap); + } + } else { + ids.forEach((id) => { + if (!nodeMap.has(id)) { + nodeMap.set(id, nodes[ids]); + addFromEdges(nodes, id, nodeMap, edgeMap); + } + }); + } + console.log(edgeMap); +} + +function cleanFromEdges(nodes, id, nodeMap, edgeMap) { + const edgesFrom = nodes[id].edges_from; + const edgesTo = nodes[id].edges_to; + nodeMap.delete(id); + //console.log(`recieved edges from: ${edgesFrom}`); + + //Delete edges by their unique key + edgesTo.forEach((toID, index) => { + console.log(`removing key: ${index}_${id}`); + edgeMap.delete(`${index}_${id}`); + }); + edgesFrom.forEach((fromID, index) => { + console.log(`removing key: ${id}_${index}`); + edgeMap.delete(`${fromID}_${id}`); + if (nodeMap.delete(fromID)) { + cleanFromEdges(nodes, fromID, nodeMap, edgeMap); + } + }); + +} + +function cleanGraph(nodes, ids, nodeMap, edgeMap) { + console.log('in clean'); + if (typeof (ids) === 'number') { + if (nodeMap.has(ids)) { + console.log('number clean'); + cleanFromEdges(nodes, ids, nodeMap, edgeMap); + } + } else { + console.log('somehow here'); + ids.forEach( (id) => { + if (nodeMap.has(id)) { + cleanFromEdges(nodes, id, nodeMap, edgeMap); + } + }); + } +} +//export default GraphSelection; + +const nodes = [ + { + 'dept': 'life', + 'description': 'being able to make good use of college', + 'edges_from': [ + 4, + 5, + ], + 'edges_to': [ + 1, + ], + 'id': 0, + 'label': 'CL 101', + 'title': 'How to College', + }, + { + 'dept': 'life', + 'description': 'Doing more advanced college stuff', + 'edges_from': [ + 0, + ], + 'edges_to': [ + 2, + 3, + ], + 'id': 1, + 'label': 'CL 102', + 'title': 'Advanced college', + }, + { + 'dept': 'life', + 'description': 'Welcome to the real world ya pansy!', + 'edges_from': [ + 1, + ], + 'edges_to': [], + 'id': 2, + 'label': 'RL 7', + 'title': 'REAL LIFE', + }, + { + 'dept': 'Life', + 'description': 'Your educated, now getting rich', + 'edges_from': [ + 1, + ], + 
'edges_to': [], + 'id': 3, + 'label': 'RL M2', + 'title': 'Getting Money', + }, + { + 'dept': 'PD', + 'description': 'You are not that important, accept it', + 'edges_from': [ + 8, + ], + 'edges_to': [ + 0, + ], + 'id': 4, + 'label': 'PD 42', + 'title': 'Getting over yourself', + }, + { + 'dept': 'HS', + 'description': 'A totally wonderful time', + 'edges_from': [ + 6, + 7, + ], + 'edges_to': [ + 0, + ], + 'id': 5, + 'label': 'HS K', + 'title': 'High School stuff', + }, + { + 'dept': 'HS', + 'description': 'seeing how terrible people, especially kids, are', + 'edges_from': [ + 8, + ], + 'edges_to': [ + 5, + ], + 'id': 6, + 'label': 'HS 69', + 'title': 'Petty School Drama', + }, + { + 'dept': 'PD', + 'description': 'the essential stage of life that is childhood', + 'edges_from': [], + 'edges_to': [ + 5, + ], + 'id': 7, + 'label': 'PD 3', + 'title': 'Kid stuff', + }, + { + 'dept': 'PD', + 'description' : 'we have all been *that* person before', + 'edges_from' : [], + 'edges_to' : [ + 4, + 6, + ], + 'id' : 8, + 'label' : 'PD FU', + 'title' : 'Being a snot nosed brat', + }, + { + 'dept': '=/', + 'description': 'question mark', + 'edges_from': [], + 'edges_to': [], + 'id': 9, + 'label': 'whatevs', + 'title': 'whatevs', + }, +]; + +function outPut(graph) { + const data = graph.getGraphData(); + console.log('NEW OUTPUT SEGMENT STARTS HERE!'); + data.nodes.forEach( (x) => { + console.log(x.id); + }); + console.log(data.edges); +} + +let myGraph = new GraphSelection(nodes, 0); +outPut(myGraph); + +//myGraph.addNodes(nodes, 3); +//outPut(myGraph); + +myGraph.removeNodes(nodes, 5); +myGraph.clear(); +outPut(myGraph); diff --git a/components/utils/filterAlgortithm.js b/components/utils/filterAlgortithm.js new file mode 100644 index 0000000..c1172dc --- /dev/null +++ b/components/utils/filterAlgortithm.js @@ -0,0 +1,191 @@ +const nodes = [ + { + 'dept': 'life', + 'description': 'being able to make good use of college', + 'edges_from': [ + 4, + 5, + ], + 'edges_to': [ + 1, + ], + 'id': 0, + 'label': 'CL 101', + 'title': 'How to College', + }, + { + 'dept': 'life', + 'description': 'Doing more advanced college stuff', + 'edges_from': [ + 0, + ], + 'edges_to': [ + 2, + 3, + ], + 'id': 1, + 'label': 'CL 102', + 'title': 'Advanced college', + }, + { + 'dept': 'life', + 'description': 'Welcome to the real world ya pansy!', + 'edges_from': [ + 1, + ], + 'edges_to': [], + 'id': 2, + 'label': 'RL 7', + 'title': 'REAL LIFE', + }, + { + 'dept': 'Life', + 'description': 'Your educated, now getting rich', + 'edges_from': [ + 1, + ], + 'edges_to': [], + 'id': 3, + 'label': 'RL M2', + 'title': 'Getting Money', + }, + { + 'dept': 'PD', + 'description': 'You are not that important, accept it', + 'edges_from': [ + 8, + ], + 'edges_to': [ + 0, + ], + 'id': 4, + 'label': 'PD 42', + 'title': 'Getting over yourself', + }, + { + 'dept': 'HS', + 'description': 'A totally wonderful time', + 'edges_from': [ + 6, + 7, + ], + 'edges_to': [ + 0, + ], + 'id': 5, + 'label': 'HS K', + 'title': 'High School stuff', + }, + { + 'dept': 'HS', + 'description': 'seeing how terrible people, especially kids, are', + 'edges_from': [ + 8, + ], + 'edges_to': [ + 5, + ], + 'id': 6, + 'label': 'HS 69', + 'title': 'Petty School Drama', + }, + { + 'dept': 'PD', + 'description': 'the essential stage of life that is childhood', + 'edges_from': [], + 'edges_to': [ + 5, + ], + 'id': 7, + 'label': 'PD 3', + 'title': 'Kid stuff', + }, + { + 'dept': 'PD', + 'description' : 'we have all been *that* person before', + 'edges_from' : [], + 'edges_to' : [ + 4, 
+ 6, + ], + 'id' : 8, + 'label' : 'PD FU', + 'title' : 'Being a snot nosed brat', + }, + { + 'dept': '=/', + 'description': 'question mark', + 'edges_from': [], + 'edges_to': [], + 'id': 9, + 'label': 'whatevs', + 'title': 'whatevs', + }, +]; + +function doFromEdges(nodes, id, newNodes, newEdges) { + const edgesFrom = nodes[id].edges_from; + + edgesFrom.forEach((fromID) => { + newEdges.push({ + 'from': fromID, + 'to': id, + }); + if (!newNodes.has(fromID)) { + newNodes.set(fromID, false); + doFromEdges(nodes, fromID, newNodes, newEdges); + } + }); +} + +/** + * @param nodes Array. + * @param ids Array. + * @returns {{edges: Array, nodes: Array}} + */ + +function filteredGraph(nodes, ids) { + let newNodes = new Map(); + let edgeList = []; + + if (typeof (ids) === 'number') { + if (!newNodes.has(ids)) { + newNodes.set(ids, false); + doFromEdges(nodes, ids, newNodes, edgeList); + } + } else { + ids.forEach((id) => { + if (!newNodes.has(id)) { + newNodes.set(id, false); + doFromEdges(nodes, id, newNodes, edgeList); + } + }); + } + + + //console.log(edgeList); + //console.log(newNodes); + const graphNodes = []; + newNodes.forEach((_, id) => { graphNodes.push(nodes[id]) }); + const newGraph = { + 'edges' : edgeList, + 'nodes': graphNodes, + }; + + // console.log('nodelist:'); + // console.log(newNodes); + // console.log('edges'); + // console.log(edgeList); + + return newGraph; +} + +//shitty output tests? +//const graph = filteredGraph(nodes, [0, 3]); + +//console.log('GRAPH:'); +//console.log(graph); +//filteredGraph(nodes, 10) + + +export default filteredGraph; diff --git a/components/utils/levenshtien.js b/components/utils/levenshtien.js new file mode 100644 index 0000000..ee7d821 --- /dev/null +++ b/components/utils/levenshtien.js @@ -0,0 +1,61 @@ + + +export function levenshtein(q, s, A, B) { + let n = q.length; + let m = s.length; + + A.length = n + 1; + B.length = n + 1; + for (let i = n + 1; i-- > 0;) { + A[i] = i; + B[i] = 0; + } + for (let j = 0; j < m; ++j) { + let x = j; + for (let i = 0; i < n; ++i) { + x = B[i + 1] = Math.min( + Math.min(x, A[i + 1]) + 1, + q[i] != s[j] ? 
A[i] + 1 : 0); + } + let C = A; + A = B; + B = C; + } + return A[n]; +} + +export function match(q) { + return (course) => { + course.searchString = (course.label + course.title + course.descr).toLowerCase(); + return fuzzyMatch(q.toLowerCase(), course.searchString); + }; +} + + +function fuzzyMatch(q, s) { + let i = s.length; + let j = q.length; + while (j !== 0 && i >= j) { + if (s[i - 1] === q[j - 1]) { + --j; + } + --i; + } + return j === 0; +} + + +// def lev (a, b): +// n, m = len(a), len(b) +// row, prev = [0] * (n + 1), [0] * (n + 1) +// for i in range(n): +// prev[i] = i +// for j in range(m): +// x = j +// for i in range(n): +// x = row[i + 1] = min( +// min(x, prev[i + 1]) + 1, +// prev[i] + 1 if a[i] != b[j] else 0) +// row, prev = prev, row +// return prev[n] + diff --git a/conf.json b/conf.json new file mode 100644 index 0000000..cdf133d --- /dev/null +++ b/conf.json @@ -0,0 +1,32 @@ +{ + "templates": { + "applicationName": "Demo", + "openGraph": { + "title": "", + "type": "website", + "image": "", + "site_name": "", + "url": "" + }, + "meta": { + "title": "", + "description": "", + "keyword": "" + }, + "source": { + "include": [ + "./src/" + ], + "includePattern": ".+\\.js(doc|x)?$", + "excludePattern": "(^|\\/|\\\\)_" + }, + "opts": { + "encoding": "utf8", + "recurse": true, + "private": false, + "lenient": true, + "destination": "./docs", + "template": "./node_modules/@pixi/jsdoc-template" + } + } +} diff --git a/crawlers/README.md b/crawlers/README.md new file mode 100644 index 0000000..6d47369 --- /dev/null +++ b/crawlers/README.md @@ -0,0 +1,11 @@ + +## To run the pisa web crawler: + cd crawlers/ucsc + mkdir logs + time scrapy crawl pisa --logfile pisa.log -o output.json + +## To create a new crawler: + cd crawlers/ucsc + scrapy genspider + +Then open crawlers > ucsc > ucsc (this subdirectory is unavoidable) > spiders > \.py diff --git a/crawlers/d-crawler/.gitignore b/crawlers/d-crawler/.gitignore new file mode 100644 index 0000000..3b20378 --- /dev/null +++ b/crawlers/d-crawler/.gitignore @@ -0,0 +1,14 @@ +.dub +docs.json +__dummy.html +docs/ +crawler-demo.so +crawler-demo.dylib +crawler-demo.dll +crawler-demo.a +crawler-demo.lib +crawler-demo-test-* +*.exe +*.o +*.obj +*.lst diff --git a/crawlers/d-crawler/course_regex_replacements.txt b/crawlers/d-crawler/course_regex_replacements.txt new file mode 100644 index 0000000..a1c9d71 --- /dev/null +++ b/crawlers/d-crawler/course_regex_replacements.txt @@ -0,0 +1,71 @@ +// Detect unmatched text: +\n[^"][^\n]+\n + +// Remove parentheses +\.([\)"]+) +$1\. 
+ +// Detect abbreviations +([A-Z][a-z]*\.\s*)+[a-z] +// then remove w/ result.replace(".","") + +// Simple abbreviation remover: +([A-Z][a-z]*\.\s*)+([a-z]) +$2 + +https?://[\w\d\-\/\.]+/([a-z]+).html +"dept"="$1" + +\n(?:\[Return to top\]) +N/A + +\*\s+Not offered in \d+\-\d+\n +N/A + +\nRevised:\s+(\d+/\d+/\d+)\n +\n"revision_date" = "$1" + +\n\s*(\d+\-\d+\s+(?:General\s+)?Catalog) +\n"catalog_version" = "$1" + +\n([A-Za-z\-\s]+)\s+Courses\n +\n"division"="$1"\n + +\n(?:Department of (?:the\s+)?([A-Za-z]+(?:\s+[A-Za-z]+)*))?\s*([^\n]+)\n(?:(?:Faculty|Program Statement|[\s\w]*Courses?[\s\w]+)|\s*\|\s*)+\n +\n"department_title" = "$1"\n"contact_info"="$2"\n + +\n(\d+[A-Z]?)\.\s+ +\n"course_id" = "$1"\n + +"\n([^\.]+)\.\s+ +"\n"course_title" = "$1"\n + +"\n([FWS](?:,[FWS])*|\*)\s+ +"\n"course_terms" = "$1"\n + +\s*((?:[A-Z](?:\.|[a-z]+,?)(?:\s+|\-))+[A-Z][a-z]*)\n +\n"course_instructor" = "$1"\n + +\s*\(General Education Code\(s\):\s+([A-Z\-,\s]+)\)\.\n +\n"ge" = "$1"\n + +\s*Offered in alternate academic years.\n +\n"offered_in_alt_academic_years" = "true"\n + +\s*May be repeated for credit.\n +\n"is_repeatable_for_credit" = "true"\n + +\s+Enrollment limited to (\d+)\.\n +\n"enroll_limit" = $1\n + +\s*Enrollment (?:is )?restricted to ([^\.]+)\.\n +\n"enroll_restrict" = "$1"\n + +Concurrent enrollment in ([^\.(?:is)]+)\s+(?:is required|required)\.\n +\n"concurrent_req" = "$1"\n + +\s*Prerequisite\(?s?\)?:\s+([^\.]+)\.\s*\n +\n"prereqs" = "$1"\n + +\n([^"][^\n]+)\n +\n"course_description = "$1"\n diff --git a/crawlers/d-crawler/dub.sdl b/crawlers/d-crawler/dub.sdl new file mode 100644 index 0000000..c463e1d --- /dev/null +++ b/crawlers/d-crawler/dub.sdl @@ -0,0 +1,9 @@ +name "crawler-demo" +description "Prototyping a distributed crawler built on vibe.d" +authors "Seiji Emery" +copyright "Copyright © 2018, Seiji Emery" +license "MIT" +#dependency "vibe-d" version="~>0.8.4" +dependency "arsd-official:dom" version="~>2.1.1" +dependency "arsd-official:htmltotext" version="~>2.1.1" +dependency "jsonizer" version="~>0.7.6" \ No newline at end of file diff --git a/crawlers/d-crawler/source/app.d b/crawlers/d-crawler/source/app.d new file mode 100644 index 0000000..05be671 --- /dev/null +++ b/crawlers/d-crawler/source/app.d @@ -0,0 +1,72 @@ +import std.stdio; +import std.net.curl: get, CurlException; +import std.format: format; +import std.exception: enforce; +import std.parallelism: parallel, defaultPoolThreads; +import std.getopt: getopt; +import std.string: toUpper, strip; +import std.regex: matchFirst, ctRegex; +import std.conv: parse; +import arsd.dom; +import arsd.htmltotext: htmlToText; +import course_data: CourseEntry; +import department_info: fetchDepartment, DepartmentInfo; +import core.sync.mutex; +import jsonizer; + +__gshared Mutex mutex; +__gshared DepartmentInfo[string] data; +shared static this () { mutex = new Mutex(); } + +void submit (DepartmentInfo dept) { + synchronized (mutex) { + data[dept.departmentId] = dept; + } +} + +// regex: \n\s+(\d+\w?)\.\s+([\w+\s+\-:,/\'\"]+)(?:\s+\((\d+)\s+credits?|no credit\))?\.(?:\s+([FWS\*,]+))?\s+(.+) +// replace: {\n\t"course_id": "$1",\n\t"course_title": "$2",\n\t"credit(s)": "$3",\n\t"offered term(s)": "$4",\n\t"description": "$5"\n},\n + +void processRegistrarCoursePage (string dept) { + writefln("Fetching data for dept '%s'", dept); + auto result = fetchDepartment("https://registrar.ucsc.edu/catalog/archive/17-18/programs-courses", dept); + writefln("%s course(s), %s faculty member(s)", + result.courses.length, result.faculty.length); + 
submit(result); + //writefln("\n%s", result); +} + +void main(string[] args) +{ + bool runParallel = false; + size_t numThreads = 16; + string outputFile = "data.json"; + args.getopt( + "parallel", &runParallel, + "nthreads", &numThreads, + "o", &outputFile); + + remove("raw_courses_html.txt"); + remove("raw_courses_text.txt"); + + string[] depts = [ + "acen", "anth", "aplx", "art", "artg", "havc", "arts", "astr", "bioc", "eeb", "mcdb", "mcdb", "chem", "chin", "clst", "cogs", "clni", "clte", "cmmu", "cowl", "cres", "crwn", "danm", "eart", "east", "econ", "educ", "ams", "beng", "bme", "cmpm", "cmpe", "cmps", "ee", "engr", "tim", "envs", "fmst", "film", "fren", "germ", "gmst", "gree", "hebr", "his", "havc", "hisc", "humn", "ital", "itst", "japn", "jwst", "krsg", "laal", "lnst", "latn", "lals", "lgst", "ling", "lit", "ocea", "math", "merr", "metx", "musc", "oaks", "ocea", "phil", "pbs", "phye", "phys", "poli", "prtr", "port", "psyc", "punj", "qsex", "crsn", "reli", "russ", "scic", "sced", "socd", "socs", "socy", "sphs", "spst", "stev", "sust", "thea", "ucdc", "writ", "yidd" + ]; + if (runParallel) { + defaultPoolThreads = 32; + foreach (dept; parallel(depts)) { + processRegistrarCoursePage(dept); + } + } else { + foreach (dept; depts) { + processRegistrarCoursePage(dept); + } + } + import std.file: write; + import std.algorithm: map; + import std.array; + import std.conv: to; + write(outputFile, data.toJSONString); + + //write("data.json", format("{ %s }", data.map!"a.to!string".join(", "))); +} diff --git a/crawlers/d-crawler/source/course_data.d b/crawlers/d-crawler/source/course_data.d new file mode 100644 index 0000000..94bf5c1 --- /dev/null +++ b/crawlers/d-crawler/source/course_data.d @@ -0,0 +1,51 @@ +module course_data; +import std.format: format; + +struct CourseEntry { + string name; + string title; + int credits; + string quartersOffered; + string departmentTitle; + string division; + string rawDescription; + string description; + string instructor; + string prereqs; + string coreqs; + bool gradOnly = false; + bool requiresInstructorPermission = false; + bool mayBeRepeatedForCredit = false; + bool satisfiesAmericanHistoryReq = false; + string enrollmentRestrictions; + string geCategories; + string courseAlias; + int enrollLimit = 0; + + + string toString () { + return format(` + { + "course_name": "%s", + "course_title": "%s", + "department": "%s", + "credits": "%d", + "terms": "%s", + "division": "%s", + "instructor": "%s", + "description": "%s", + "prereqs": "%s", + "coreqs": "%s", + "enrollment_restrictions": "%s", + "requires_instructor_permission": "%s", + "repeatable_for_credit": "%s", + "satisfies_american_history_and_institutions_req": "%s", + "alias": "%s", + "ge_categories": "%s", + "enroll_limit": %d, + "raw_description": "%s", + },`, name, title, departmentTitle, credits, quartersOffered, division, instructor, description, + prereqs, coreqs, enrollmentRestrictions, requiresInstructorPermission, + mayBeRepeatedForCredit, satisfiesAmericanHistoryReq, courseAlias, geCategories, enrollLimit, rawDescription); + } +} \ No newline at end of file diff --git a/crawlers/d-crawler/source/department_info/fetch_courses.d b/crawlers/d-crawler/source/department_info/fetch_courses.d new file mode 100644 index 0000000..e0cb21c --- /dev/null +++ b/crawlers/d-crawler/source/department_info/fetch_courses.d @@ -0,0 +1,126 @@ +module department_info.fetch_courses; +import department_info.model; +import department_info.parse_utils; +import util.fetch_html: fetchHtml; +import 
util.search_utils: childRange, regexMatch; +import std.stdio; +import std.regex; +import std.exception: enforce; +import std.string: strip, toLower; +import std.array: replace; +import arsd.dom; + +DepartmentInfo fetchCourses (DepartmentInfo dept) { + fetchHtml(dept.coursesUrl, dept.error, (Document document) { + auto main = document + .requireSelector("body") + .requireSelector("div[id=wrap]") + .requireSelector("div[id=container]") + .requireSelector("div[id=sprflt]") + .requireSelector("div[id=main]"); + + dept.departmentName = main + .requireSelector("h1[id=title]") + .innerText; + + auto content = main.requireSelector("div[class~=content]"); + + auto text = content.innerText; + + + import std.file; + append("raw_courses_html.txt", format("\n%s\n%s\n", dept.coursesUrl, content.innerHTML)); + append("raw_courses_text.txt", format("\n%s\n%s\n", dept.coursesUrl, content.innerText)); + + auto sections = content.childRange + .splitSectionsByHeaders; + + foreach (section, items; sections) { + if (auto match = matchFirst(section, ctRegex!`([\w\-]+(?:\s+[\w\-])*)\s+Courses`)) { + section = match[1].toLower; + } else { + writefln("Non-matching section: '%s'", section); + continue; + } + + //writefln("Section %s:", section); + foreach (item; items) { + //writefln("\t%s", item.innerText); + auto text = item.innerText.strip(); + if (text == "" || matchFirst(text, ctRegex!`(\* Not offered in|\[Return to top\])`)) { continue; } + if (auto match = matchFirst(text, ctRegex!`Revised:\s+([^\n]+)`)) { + dept.lastCourseRevisionDate = match[1]; + continue; + } + + //size_t i = 0; + //writefln("%d: %s\n", ++i, text); + auto courseNumber = matchFirst(text, ctRegex!`(\d+[A-Z]?)\.(?:\s+|$)`); + //enforce(courseNumber, format("Could not match course number in '%s'", text)); + if (!courseNumber) { + writefln("Could not match course number in '%s'", text); + continue; + } + + + string name = dept.departmentId ~ " " ~ courseNumber[1]; + text = courseNumber.post; + + //writefln("%d: %s\n", ++i, text); + text = text.replace("U.S.", "US"); + + //writefln("%d: %s\n", ++i, text); + string title, units, terms; + if (text.length) { + auto match = matchFirst(text, ctRegex!`([^\.]+)(?:\s+\((\d+)\s+units?\))?\.(?:\s+|$)`); + if (!match && ((match = matchFirst(text, ctRegex!`([FWS](?:,[FWS])*|\*)?\s*`)))) { + terms = match[1].replace(",",""); + text = match.post; + } else { + enforce(match, format("Could not match course title in '%s'", text)); + title = match[1]; + units = match[2] ? match[2] : "-1"; + text = match.post; + + //writefln("%d: %s\n", ++i, text); + if (!!(match = matchFirst(text, ctRegex!`([FWS](?:,[FWS])*|\*)?\s*`))) { + terms = match[1].replace(",",""); + text = match.post; + } + } + } + + //writefln("%d: %s\n", ++i, text); + string geCodes = null; + if (auto match = matchFirst(text, ctRegex!(`\s+\(General Education Code\(s\):\s+([^\.\)]+)[\.\)]+`, "g"))) { + geCodes = match[1]; + text = match.pre ~ match.post; + } + //writefln("%d: %s\n", ++i, text); + //auto instructorMatch = matchFirst(text, ctRegex!`(?:\.\)?\s+|^)([^\.]+)\.?\s*$`); + string instructor = null; + if (text && text.length) { + + // see this stupid thing here? + // \.["\)]? + // blame english style guides (or lack thereof...). (ie. `(fubar.) `"Baz."` etc...) 
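+
+                    // e.g. a description ending `... course 19A. The Staff` should yield
+                    // instructor `The Staff`; the optional `"` / `)` after the period lets
+                    // quoted or parenthesized sentence endings match as well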
+ + auto instructorMatch = matchFirst(text, ctRegex!`(?:\.["\)]?\s+|^)([^\.]+)\.?\s*$`); + enforce(instructorMatch, format("Could not match instructor in '%s'", text)); + instructor = instructorMatch[1]; + text = instructorMatch.pre; + //writefln("%d: %s\n", ++i, text); + } + //writefln("\t%s '%s' (%s units). '%s'. '%s'. %s", name, title, units, instructor, terms, text); + + if (name in dept.courses) { + writefln("'%s' already exists in deps.courses!", name); + } + dept.courses[name] = DepartmentInfo.CourseListing( + name, title, section, terms, instructor, text, geCodes + ); + } + } + }); + return dept; +} diff --git a/crawlers/d-crawler/source/department_info/fetch_faculty.d b/crawlers/d-crawler/source/department_info/fetch_faculty.d new file mode 100644 index 0000000..2956653 --- /dev/null +++ b/crawlers/d-crawler/source/department_info/fetch_faculty.d @@ -0,0 +1,55 @@ +module department_info.fetch_faculty; +import department_info.model; +import util.fetch_html: fetchHtml; +import util.search_utils: childRange, regexMatch; +import std.stdio; +import std.regex; +import std.exception: enforce; +import std.string: strip; +import arsd.dom; + +DepartmentInfo fetchFaculty (DepartmentInfo dept) { + fetchHtml(dept.facultyUrl, dept.error, (Document document) { + auto main = document + .requireSelector("body") + .requireSelector("div[id=wrap]") + .requireSelector("div[id=container]") + .requireSelector("div[id=sprflt]") + .requireSelector("div[id=main]"); + + dept.departmentName = main + .requireSelector("h1[id=title]") + .innerText; + + auto content = main.requireSelector("div[class~=content]"); + auto sections = content.childRange + .splitSectionsByHeaders; + + foreach (section, items; sections) { + if (section == "♦ ♦ ♦" || section == "") continue; + + //writefln("Section %s:", section); + foreach (item; items) { + auto text = item.innerText.strip(); + if (!text.length || matchFirst(text, ctRegex!`(\* Not offered in|\[Return to top\]|♦ ♦ ♦|Revised:[^\n]+)`)) { continue; } + //if (auto match = matchFirst(text, ctRegex!`Revised:\s+([^\n]+)`)) { + // dept.lastCourseRevisionDate = match[1]; + // continue; + //} + auto match = matchFirst(text, ctRegex!`(\w+\s+(?:\w\.\s+)?\w+)\s*([^\n]+)?`); + enforce(match, format("Could not match professor listing...? 
'%s'", text)); + auto name = match[1].strip(); + auto description = match[2].strip(); + + //enforce(name !in dept.faculty, format("'%s' already exists in dept.faculty", name)); + if (name in dept.faculty) { + writefln("'%s' already exists in dept.faculty!", name); + } + dept.faculty[name] = DepartmentInfo.FacultyListing( + name, section, dept.departmentId, description + ); + } + } + }); + return dept; +} diff --git a/crawlers/d-crawler/source/department_info/fetch_info.d b/crawlers/d-crawler/source/department_info/fetch_info.d new file mode 100644 index 0000000..7b6e454 --- /dev/null +++ b/crawlers/d-crawler/source/department_info/fetch_info.d @@ -0,0 +1,54 @@ +module department_info.fetch_info; +import department_info.model; +import util.fetch_html: fetchHtml; +import util.search_utils: childRange, regexMatch; +import std.stdio; +import std.regex; +import std.exception: enforce; +import std.string: strip; +import arsd.dom; + +DepartmentInfo fetchInfo (DepartmentInfo dept) { + fetchHtml(dept.programUrl, dept.error, (Document document) { + auto main = document + .requireSelector("body") + .requireSelector("div[id=wrap]") + .requireSelector("div[id=container]") + .requireSelector("div[id=sprflt]") + .requireSelector("div[id=main]"); + + dept.departmentName = main + .requireSelector("h1[id=title]") + .innerText; + + auto content = main.requireSelector("div[class~=content]"); + try { + auto sections = content.childRange + .requireSeq((child) { + return child.tagName == "p" && child.regexMatch!`(\d+\-\d+ (?:General )?Catalog)`(dept.catalogVersion); + }) + .requireSeq((child) { + //writefln("Got <%s>: %s", child.tagName, child.innerText); + if (!(child.tagName == "p" && child.innerText.strip() != "" && child.regexMatch!`(.[^\n]+)`(dept.departmentAddress))) { + return false; + } + auto match = matchFirst(child.innerText, ctRegex!(`(?:([\(\)\d\-\s]+)[\n\s]+)?(http.+)`, "g")); + enforce(match, format("Could not find contact info in '%s'", child.innerText)); + dept.departmentPhoneNumber = match[1].strip(); + dept.departmentUrl = match[2]; + return true; + }) + //.requireSeq((child) { + // return child.tagName == "hr" + //}); + //.splitSectionsByHeaders + ; + } catch (Throwable e) { + writefln("Error parsing deparment page: \u001b[31m%s\u001b[0m", e); + //writefln("\u001b[36mError parsing document.\n\u001b[31m%s\n\n"~ + // "\u001b[33mContent dump:\n%s\n\u001b[0m", + // e, content.innerHTML); + } + }); + return dept; +} diff --git a/crawlers/d-crawler/source/department_info/model.d b/crawlers/d-crawler/source/department_info/model.d new file mode 100644 index 0000000..8904210 --- /dev/null +++ b/crawlers/d-crawler/source/department_info/model.d @@ -0,0 +1,110 @@ +module department_info.model; +public import std.format: format; +import std.string; +import std.conv: to; +import std.array; +import std.algorithm; +import jsonizer; + + +class DepartmentInfo { +public: + mixin JsonizeMe; + + @jsonize { + string departmentId; + string programUrl; + string coursesUrl; + string facultyUrl; + } + + Exception error = null; + + @jsonize { + string catalogVersion; + string departmentName; + string departmentUrl; + string departmentAddress; + string departmentPhoneNumber; + string lastCourseRevisionDate; + + string rawProgramStatement; + + FacultyListing[string] faculty; + CourseListing[string] courses; + } + + struct ProgramListing { + mixin JsonizeMe; + @jsonize { + string section; + string content; + } + //string toString () { + // return format(`{ "section": "%s", "content": "%s" }`, + // section, content); + 
//} + } + struct FacultyListing { + mixin JsonizeMe; + @jsonize { + string name; + string title; + string department; + string description; + } + //string toString () { + // return format(`"%s": { "title": "%s", "dept": "%s", "description": "%s" }`, + // name, title, department, description); + //} + } + struct CourseListing { + mixin JsonizeMe; + @jsonize { + string name; + string title; + string division; + string terms; + string instructor; + string description; + string geCategories; + } + //string toString () { + // return format(`"%s": { "title": "%s", "division": "%s", "terms": "%s", "instructor": "%s", "description": "%s", "GE": "%s" }`, + // name, title, division, terms, instructor, description, geCategories); + //} + } + + this (string baseUrl, string departmentId) { + departmentId = departmentId.toLower; + this.departmentId = departmentId.toUpper; + this.programUrl = format("%s/program-statements/%s.html", baseUrl, departmentId); + this.coursesUrl = format("%s/course-descriptions/%s.html", baseUrl, departmentId); + this.facultyUrl = format("%s/faculty/%s.html", baseUrl, departmentId); + } + + //override string toString () { + // return format(`"%s": { + // "name": "%s", + // "courses-revision": "%s", + // "etc": { + // "program-page": "%s", + // "courses-page": "%s", + // "faculty-page": "%s", + // "homepage": "%s", + // "address": "%s", + // "phone": "%s", + // }, + // "faculty": { + // %s + // }, + // "courses": { + // %s + // } + // }`, departmentId, departmentName, catalogVersion, programUrl, coursesUrl, facultyUrl, + // departmentUrl, departmentAddress, departmentPhoneNumber, + // faculty.values.map!"a.to!string".join(", "), + // courses.values.map!"a.to!string".join(", ") + // ); + //} +} diff --git a/crawlers/d-crawler/source/department_info/package.d b/crawlers/d-crawler/source/department_info/package.d new file mode 100644 index 0000000..b363f67 --- /dev/null +++ b/crawlers/d-crawler/source/department_info/package.d @@ -0,0 +1,73 @@ +module department_info; +public import department_info.model: DepartmentInfo; +public import department_info.fetch_info: fetchInfo; +public import department_info.fetch_courses: fetchCourses; +public import department_info.fetch_faculty: fetchFaculty; +import std.format: format; + +DepartmentInfo fetchDepartment (string baseUrl, string departmentId) { + return new DepartmentInfo(baseUrl, departmentId) + .fetchInfo() + .fetchCourses() + .fetchFaculty() + ; +} + +unittest { + import utils.expect: expect; + + auto dept = fetchDepartment("https://registrar.ucsc.edu/catalog/archive/17-18/programs-courses", "math"); + expect(dept.departmentId).toEqual("MATH"); + expect(dept.programUrl).toEqual("https://registrar.ucsc.edu/catalog/archive/17-18/programs-courses/program-statements/math.html"); + expect(dept.coursesUrl).toEqual("https://registrar.ucsc.edu/catalog/archive/17-18/programs-courses/course-descriptions/math.html"); + expect(dept.facultyUrl).toEqual("https://registrar.ucsc.edu/catalog/archive/17-18/programs-courses/faculty/math.html"); + + expect(dept.error).toEqual(null); + expect(dept.catalogVersion).toEqual("2017-18 General Catalog"); + expect(dept.departmentName).toEqual("Mathematics"); + expect(dept.departmentUrl).toEqual("http://www.math.ucsc.edu"); + expect(dept.departmentAddress).toEqual("4111 McHenry"); + expect(dept.departmentPhoneNumber).toEqual("(831) 459-2969"); + + expect(dept.faculty).toContain("Richard Montgomery"); + expect(dept.faculty["Richard Montgomery"].name).toEqual("Richard Montgomery"); + 
expect(dept.faculty["Richard Montgomery"].title).toEqual("Professor"); + expect(dept.faculty["Richard Montgomery"].department).toEqual("MATH"); + expect(dept.faculty["Richard Montgomery"].description).toEqual( + "Celestial mechanics, differential geometry, gauge theory, mechanics (quantum and classical), and singularity theory"); + + expect(dept.faculty).toContain("Richard Montgomery"); + expect(dept.faculty["Marvin J. Greenberg"].name).toEqual("Marvin J. Greenberg"); + expect(dept.faculty["Marvin J. Greenberg"].title).toEqual("Emeriti"); + expect(dept.faculty["Marvin J. Greenberg"].department).toEqual("MATH"); + expect(dept.faculty["Marvin J. Greenberg"].description).toEqual(""); + + expect(dept.faculty).toNotContain("Daniele Venturi (Applied Math and Statistics)"); + expect(dept.faculty).toNotContain("Daniele Venturi"); + + expect(dept.courses).toContain("MATH 19B"); + expect(dept.courses["MATH 19B"].name).toEqual("MATH 19B"); + expect(dept.courses["MATH 19B"].title).toEqual("Calculus for Science, Engineering, and Mathematics"); + expect(dept.courses["MATH 19B"].division).toEqual("lower-division"); + expect(dept.courses["MATH 19B"].terms).toEqual("FWS"); + expect(dept.courses["MATH 19B"].instructor).toEqual("The Staff"); + expect(dept.courses["MATH 19B"].description).toEqual( + "The definite integral and the fundamental theorem of calculus. Areas, volumes. Integration by parts, "~ + "trigonometric substitution, and partial fractions methods. Improper integrals. Sequences, series, "~ + "absolute convergence and convergence tests. Power series, Taylor and Maclaurin series. Students "~ + "cannot receive credit for both this course and course 11B, Applied Math and Statistics 11B and 15B, "~ + "or Economics 11B. Prerequisite(s): course 19A or 20A or AP Calculus AB exam score of 4 or 5, or BC "~ + "exam score of 3 or higher, or IB Mathematics Higher Level exam score of 5 of higher"); + + expect(dept.courses).toContain("MATH 249B"); + expect(dept.courses["MATH 249B"].name).toEqual("MATH 249B"); + expect(dept.courses["MATH 249B"].title).toEqual("Mechanics II"); + expect(dept.courses["MATH 249B"].division).toEqual("graduate"); + expect(dept.courses["MATH 249B"].terms).toEqual("*"); + expect(dept.courses["MATH 249B"].instructor).toEqual("The Staff"); + expect(dept.courses["MATH 249B"].description).toEqual( + "Hamiltonian dynamics with symmetry. Key topics center around the momentum map and the theory of "~ + "reduction in both the symplectic and Poisson context. Applications are taken from geometry, rigid "~ + "body dynamics, and continuum mechanics. Course 249A is recommended as preparation. 
Enrollment "~ + "restricted to graduate students"); +} diff --git a/crawlers/d-crawler/source/department_info/parse_utils.d b/crawlers/d-crawler/source/department_info/parse_utils.d new file mode 100644 index 0000000..feb4591 --- /dev/null +++ b/crawlers/d-crawler/source/department_info/parse_utils.d @@ -0,0 +1,200 @@ +module department_info.parse_utils; +import department_info.model; +import utils.expect: expect; +import std.regex; +import std.string; +import std.conv: parse; +import std.stdio; + + +public string fixSentencePeriods (ref string s) { + return s = s.replaceAll(ctRegex!`\.(["\)]+)`, "$1."); +} +public string fixSentencePeriods (string s) { + return fixSentencePeriods(s); +} +unittest { + expect(fixSentencePeriods("")).toEqual(""); + expect(fixSentencePeriods(".)")).toEqual(")."); + expect(fixSentencePeriods(`."`)).toEqual(`".`); + expect(fixSentencePeriods(`.)"`)).toEqual(`)".`); + expect(fixSentencePeriods(`.")`)).toEqual(`").`); + expect(fixSentencePeriods("(Hello).")).toEqual("(Hello)."); + expect(fixSentencePeriods("Hello. (World!). \"Hello\". World")).toEqual("Hello. (World!). \"Hello\". World"); + expect(fixSentencePeriods("(Hello.) \"World.\" Hello.")).toEqual("(Hello). \"World\". Hello."); +} + +public string fixAbbreviations (ref string s) { + writefln("Attempting match... '%s'", s); + return s = replaceAll!((Captures!string match) { + writefln("Matched: '%s'", match[1]); + return match[1]; + })(s, ctRegex!(`([A-Z][a-z]*)\.($|[A-Z]|\s+[a-z])`, "g")); + + + //ctRegex!(`([A-Z][a-z]*\.)+(\s+[a-z]|\s+[a-zA-Z]+[^\.]|[A-Z][a-z]*\.(?:\s+[A-Z])|\s*$)`, "g")); +} +public string fixAbbreviations (string s) { + return fixAbbreviations(s); +} +unittest { + //writefln("Testing..."); + //fixAbbreviations("U.S. Asdf"); + //fixAbbreviations("U.S. asdf"); + //fixAbbreviations("U.S."); + //fixAbbreviations("U.S. A"); + //fixAbbreviations("U. St"); + //fixAbbreviations("U.S Ta"); + + + expect(fixAbbreviations("U.S.")).toEqual("US"); + expect(fixAbbreviations("Ph.D.")).toEqual("PhD"); + expect(fixAbbreviations("U.S. stuff. B. A")).toEqual("US stuff. B. A"); + expect(fixAbbreviations("Ph.D. fubar. Baz.")).toEqual("PhD fubar. Baz."); + expect(fixAbbreviations("Ph.D fubar. Baz.")).toEqual("PhD fubar. Baz."); + expect(fixAbbreviations("Ph.D. Fubar. Baz.")).toEqual("PhD. Fubar. Baz."); + expect(fixAbbreviations("Fubar. Bar Ph.D. fubar. Baz.")).toEqual("Fubar. Bar PhD fubar. Baz."); +} + + +public bool parseCourseNumber (DepartmentInfo context, ref string s, out string result) { + if (auto match = matchFirst(s, ctRegex!(`^(\d+[A-Z]?)\.(?:\s+|$)`))) { + result = context.departmentId ~ " " ~ match[1]; + s = match.post; + return true; + } + return false; +} + +private void expectParse + (alias f, T, string file = __FILE__, size_t line = __LINE__) + (DepartmentInfo context, bool returnValue, string input, T result, string output) +{ + T outValue; + expect!(bool, file, line)(f(context, input, outValue)).toEqual(returnValue); + expect!(string, file, line)(input).toEqual(output); + expect!(T, file, line)(outValue).toEqual(result); +} +unittest { + auto info = new DepartmentInfo("", "math"); + expect(info.departmentId).toEqual("MATH"); + expectParse!parseCourseNumber(info, false, "", "", ""); + expectParse!parseCourseNumber(info, false, " 1.", "", " 1."); + expectParse!parseCourseNumber(info, false, "M123f. asdf", "", "M123f. asdf"); + expectParse!parseCourseNumber(info, true, "1. asdf", "MATH 1", "asdf"); + expectParse!parseCourseNumber(info, true, "1234A. 
Fubar", "MATH 1234A", "Fubar"); + expectParse!parseCourseNumber(info, true, "1234A. Fubar", "MATH 1234A", "Fubar"); +} + +public bool parseCourseUnits (DepartmentInfo context, ref string s, out int result) { + if (auto match = matchFirst(s, ctRegex!(`\s*\((\d+) units?\)`, "g"))) { + string text = match[1]; + result = text.parse!int; + s = match.pre ~ match.post; + return true; + } + return false; +} +unittest { + auto info = new DepartmentInfo("", "math"); + expect(info.departmentId).toEqual("MATH"); + expectParse!parseCourseUnits(info, false, "", 0, ""); + expectParse!parseCourseUnits(info, true, "(1 unit)", 1, ""); + expectParse!parseCourseUnits(info, true, "(1 unit).", 1, "."); + expectParse!parseCourseUnits(info, true, "(1 unit). fubar", 1, ". fubar"); + expectParse!parseCourseUnits(info, true, "Foo (20 units). fubar", 20, "Foo. fubar"); + expectParse!parseCourseUnits(info, true, "Foo (20 units)", 20, "Foo"); + expectParse!parseCourseUnits(info, false, "Foo (20 units.)", 0, "Foo (20 units.)"); +} + +//public bool parseCourseTerm (DepartmentInfo context, ref string s, out string result) { +// if (auto match = matchFirst(s, ctRegex!`(?:^|\s+)([FWS](?:\,[FWS])*|\*)(\s+|$)`)) { +// result = match[1].replace(",",""); +// s = match.pre ~ match[2] ~ match.post; +// return true; +// } +// return false; +//} +//unittest { +// auto info = new DepartmentInfo("", "math"); +// expect(info.departmentId).toEqual("MATH"); +// expectParse!parseCourseTerm(info, false, "", "", ""); +// expectParse!parseCourseTerm(info, true, "*", "*", ""); +// expectParse!parseCourseTerm(info, true, "F", "F", ""); +// expectParse!parseCourseTerm(info, true, "F,W", "FW", ""); +// expectParse!parseCourseTerm(info, false, "FW", "", "FW"); +// expectParse!parseCourseTerm(info, false, "F,,W", "", "F,,W"); +// expectParse!parseCourseTerm(info, false, "F,WW", "", "F,WW"); +// expectParse!parseCourseTerm(info, true, "borg. F,W,S", "FWS", "borg."); +// expectParse!parseCourseTerm(info, true, "borg. F,W,S asdf", "FWS", "borg. asdf"); +// expectParse!parseCourseTerm(info, true, "borg. F,W,S asdf", "FWS", "borg. asdf"); +// expectParse!parseCourseTerm(info, true, "borg. * asdf", "*", "borg. asdf"); +// expectParse!parseCourseTerm(info, false, "borg.* asdf", "", "borg.* asdf"); +// expectParse!parseCourseTerm(info, false, "Spring", "", "Spring"); +// expectParse!parseCourseTerm(info, true, "F Spring", "F", " Spring"); +//} + +//public bool parseCourseTitle (DepartmentInfo context, ref string s, out string result) { +// return false; +//} +//unittest { +// auto info = new DepartmentInfo("", "math"); +// expect(info.departmentId).toEqual("MATH"); +// expectParse!parseCourseTitle(info, false, "", "", ""); +// expectParse!parseCourseTitle(info, true, "*", "*", ""); +// expectParse!parseCourseTitle(info, true, "F", "F", ""); +// expectParse!parseCourseTitle(info, true, "F,W", "FW", ""); +// expectParse!parseCourseTitle(info, false, "FW", "", "FW"); +// expectParse!parseCourseTitle(info, false, "F,,W", "", "F,,W"); +// expectParse!parseCourseTitle(info, false, "F,WW", "", "F,WW"); +// expectParse!parseCourseTitle(info, true, "borg. F,W,S", "FWS", "borg."); +// expectParse!parseCourseTitle(info, true, "borg. F,W,S asdf", "FWS", "borg. asdf"); +// expectParse!parseCourseTitle(info, true, "borg. F,W,S asdf", "FWS", "borg. asdf"); +// expectParse!parseCourseTitle(info, true, "borg. * asdf", "*", "borg. 
asdf"); +// expectParse!parseCourseTitle(info, false, "borg.* asdf", "", "borg.* asdf"); +// expectParse!parseCourseTitle(info, false, "Spring", "", "Spring"); +// expectParse!parseCourseTitle(info, true, "F Spring", "F", " Spring"); +//} +//public bool parseCourseInstructor (DepartmentInfo context, ref string s, out string result) { +// return false; +//} +//unittest { +// auto info = new DepartmentInfo("", "math"); +// expect(info.departmentId).toEqual("MATH"); +// expectParse!parseCourseInstructor(info, false, "", "", ""); +// expectParse!parseCourseInstructor(info, true, "*", "*", ""); +// expectParse!parseCourseInstructor(info, true, "F", "F", ""); +// expectParse!parseCourseInstructor(info, true, "F,W", "FW", ""); +// expectParse!parseCourseInstructor(info, false, "FW", "", "FW"); +// expectParse!parseCourseInstructor(info, false, "F,,W", "", "F,,W"); +// expectParse!parseCourseInstructor(info, false, "F,WW", "", "F,WW"); +// expectParse!parseCourseInstructor(info, true, "borg. F,W,S", "FWS", "borg."); +// expectParse!parseCourseInstructor(info, true, "borg. F,W,S asdf", "FWS", "borg. asdf"); +// expectParse!parseCourseInstructor(info, true, "borg. F,W,S asdf", "FWS", "borg. asdf"); +// expectParse!parseCourseInstructor(info, true, "borg. * asdf", "*", "borg. asdf"); +// expectParse!parseCourseInstructor(info, false, "borg.* asdf", "", "borg.* asdf"); +// expectParse!parseCourseInstructor(info, false, "Spring", "", "Spring"); +// expectParse!parseCourseInstructor(info, true, "F Spring", "F", " Spring"); +//} + +//public bool parseCoursePrereqs (DepartmentInfo context, ref string s, out string result) { +// return false; +//} +//unittest { +// auto info = new DepartmentInfo("", "math"); +// expect(info.departmentId).toEqual("MATH"); +// expectParse!parseCoursePrereqs(info, false, "", "", ""); +// expectParse!parseCoursePrereqs(info, true, "*", "*", ""); +// expectParse!parseCoursePrereqs(info, true, "F", "F", ""); +// expectParse!parseCoursePrereqs(info, true, "F,W", "FW", ""); +// expectParse!parseCoursePrereqs(info, false, "FW", "", "FW"); +// expectParse!parseCoursePrereqs(info, false, "F,,W", "", "F,,W"); +// expectParse!parseCoursePrereqs(info, false, "F,WW", "", "F,WW"); +// expectParse!parseCoursePrereqs(info, true, "borg. F,W,S", "FWS", "borg."); +// expectParse!parseCoursePrereqs(info, true, "borg. F,W,S asdf", "FWS", "borg. asdf"); +// expectParse!parseCoursePrereqs(info, true, "borg. F,W,S asdf", "FWS", "borg. asdf"); +// expectParse!parseCoursePrereqs(info, true, "borg. * asdf", "*", "borg. 
asdf"); +// expectParse!parseCoursePrereqs(info, false, "borg.* asdf", "", "borg.* asdf"); +// expectParse!parseCoursePrereqs(info, false, "Spring", "", "Spring"); +// expectParse!parseCoursePrereqs(info, true, "F Spring", "F", " Spring"); +//} diff --git a/crawlers/d-crawler/source/util/expect.d b/crawlers/d-crawler/source/util/expect.d new file mode 100644 index 0000000..909bed4 --- /dev/null +++ b/crawlers/d-crawler/source/util/expect.d @@ -0,0 +1,43 @@ +module utils.expect; +import core.exception: AssertError; +import std.format: format; + +public auto expect (T, string file = __FILE__, size_t line = __LINE__)(T value) { + return Expectation!T(value, file, line); +} + +private struct Expectation (T) { + T value; + string file; + size_t line; + + this (T value, string file, size_t line) { + this.value = value; + this.file = file; + this.line = line; + } + void toEqual (U)(U other) { + if (value != other) { + throw new AssertError(format("expected '%s', got '%s'", + other, value), file, line); + } + } + void toNotEqual (U)(U other) { + if (value == other) { + throw new AssertError(format("expected '%s', got '%s'", + other, value), file, line); + } + } + void toContain (K)(K key) { + if (key !in value) { + throw new AssertError(format("expected item to contain key '%s': '%s')", + key, value), file, line); + } + } + void toNotContain (K)(K key) { + if (key in value) { + throw new AssertError(format("expected item not to contain key '%s': '%s')", + key, value), file, line); + } + } +} diff --git a/crawlers/d-crawler/source/util/fetch_html.d b/crawlers/d-crawler/source/util/fetch_html.d new file mode 100644 index 0000000..db82048 --- /dev/null +++ b/crawlers/d-crawler/source/util/fetch_html.d @@ -0,0 +1,28 @@ +module util.fetch_html; +import std.net.curl: get, CurlException; +import arsd.dom: Document, Element; +import std.stdio: writefln; + +bool fetchHtml (string url, ref Exception error, void delegate(Document) callback) { + if (error) { return false; } + string html = null; + Document document = null; + try { + writefln("\u001b[36mFetching %s\u001b[0m", url); + html = cast(string)get(url); + document = new Document(html); + } catch (Exception e) { + writefln("\u001b[32;1mError fetching %s:\u001b[31m\n\t%s\u001b[0m", url, e); + error = e; + return false; + } + try { + callback(document); + writefln("\u001b[32mFinished loading %s\u001b[0m", url); + } catch (Throwable e) { + writefln("\u001b[36mParsing document with url '%s' failed.\n\u001b[31m%s\n\n"~ + "\u001b[33mText Dump:\n%s\n\u001b[0m", + url, e, html); + } + return true; +} diff --git a/crawlers/d-crawler/source/util/search_utils.d b/crawlers/d-crawler/source/util/search_utils.d new file mode 100644 index 0000000..2f81344 --- /dev/null +++ b/crawlers/d-crawler/source/util/search_utils.d @@ -0,0 +1,115 @@ +module util.search_utils; +import arsd.dom; +import std.regex; +import std.exception: enforce; +import std.format: format; +import std.stdio: writefln; + +ElementRange childRange (Element elem) { + return ElementRange(elem, 0, elem.children.length); +} + +bool regexMatch (string regex, Args...)(Element elem, ref Args args) { + auto match = matchFirst(elem.innerText, ctRegex!regex); + if (!match) return false; + size_t i = 0; + foreach (ref arg; args) { + arg = match[++i]; + } + return true; +} + +struct ElementRange { + private Element head; + private size_t s, e; + + this (Element elem, size_t start, size_t stop) { + this.head = elem; + this.s = start; + this.e = stop; + } + + private bool bounded () { return s < e; } + private 
void assertBounded () { assert(bounded, format("%s > %s!", s, e)); } + + bool empty () { return !bounded; } + size_t length () { return e - s; } + + Element front () { assertBounded(); return head[s]; } + Element back () { assertBounded(); return head[e - 1]; } + Element moveFront () { assertBounded(); return head[s++]; } + Element moveBack () { assertBounded(); return head[--e]; } + Element opIndex (size_t i) { + assert(s + i < e, format("Out of range: %s + %s = %s > %s!", s, i, s + i, e)); + return head[s + i]; + } + void popFront () { assertBounded(); ++s; } + void popBack () { assertBounded(); --e; } + ElementRange save () { return ElementRange(head, s, e); } + + ref ElementRange requireSeq (bool delegate (Element elem) predicate) { + auto saved = save(); + //writefln("Range empty? %s", empty); + while (!empty && !predicate(front)) { + //writefln("Did not match '%s'", front); + popFront; + } + enforce(!empty, format("Failed to find sequence, starting at %s", + !saved.empty ? saved.front.innerHTML : "")); + popFront; + //writefln("Matched! %s", front); + return this; + } + + unittest { + import std.range.primitives; + static assert(isForwardRange!ElementRange); + static assert(isBidirectionalRange!ElementRange); + static assert(isRandomAccessRange!ElementRange); + } + + ref ElementRange processSectionsSplitBy ( + bool delegate (Element) headerPredicate, + void delegate (Element, ElementRange) handleSection + ) { + if (!empty) { + ElementRange section = save; + Element header = null; + for (auto it = save; true; it.popFront) { + if (it.empty || headerPredicate(it.front)) { + section.e = it.s; + handleSection(header, section); + if (!it.empty) { + header = it.front; + section.s = it.s + 1; + } else break; + } + } + } + return this; + } + ElementRange[string] splitSectionsByHeaders () { + ElementRange[string] sections; + processSectionsSplitBy( + (Element e) { return e.tagName == "h1" || e.tagName == "h2" || e.tagName == "h3"; }, + (Element header, ElementRange section) { + sections[header ? header.innerText : ""] = section.save; + } + ); + return sections; + } + string innerText () { + string text = ""; + for (size_t i = s; i < e; ++i) { + text ~= head[i].innerText; + } + return text; + } + string innerHTML () { + string text = ""; + for (size_t i = s; i < e; ++i) { + text ~= head[i].innerHTML; + } + return text; + } +} diff --git a/crawlers/ucsc/fetch_course_pages.py b/crawlers/ucsc/fetch_course_pages.py new file mode 100644 index 0000000..f0c01c2 --- /dev/null +++ b/crawlers/ucsc/fetch_course_pages.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import re +from bs4 import BeautifulSoup, Comment +from urllib.request import HTTPError +from fetch_index import fetch_soup, enforce, fetch_department_urls +import os + +def extract_text (element): + # this is REALLY f***ing ugly... + if isinstance(element, Comment): + return '' + elif element.name == 'p': + return '\n%s\n'%(u''.join(map(extract_text, element))) + elif element.name == 'div': + return '\n%s\n'%(u''.join(map(extract_text, element))) + elif element.name == 'br': + return '\n' + elif element.name == 'strong': + # This probably deserves some explaination. Ok, issues are as follows: + # – some idiot put a line break to separate stuff-that-should-be-separated in lgst. + # line break / paragraph element doesn't show up elsewhere, so we have to catch + + # address it here. 
+        # - some other idiot put a line break in anthropology, separating a title that
+        #   SHOULDN'T be separated
+        #
+        # So, we do the following:
+        # - we manually concatenate all of the inner text tags (b/c no way to do this otherwise)
+        # - if non-empty text is followed by a line break, we emit a '\n' afterwards
+        # - if not we don't, b/c there shouldn't be any good reason to put a <br> 
inside of a + # strong tag given what the registrar page is supposed to look like... + text = '' + has_non_internal_line_break = False + for child in element: + if child.name == 'br': + has_non_internal_line_break = True + elif child.name == None: + text += child + has_non_internal_line_break = False + return text + '\n' if has_non_internal_line_break else text + elif element.name is None: + return '%s'%element + elif element.name == 'comment': + raise Exception("Skipping comment %s"%element.text) + else: + return element.text + +def extract_sections (content, dept): + divisions = {} + text = '' + division = None + for child in content: + if child.name == 'h1' or child.name == 'h2' or child.name == 'h3' or child.name == 'h4': + match = re.match(r'^\s*([A-Z][a-z]+(?:\-[A-Z][a-z]+)*)\s+Courses', child.text) + enforce(match, "Expected header to be course heading, got '%s'", child.text) + if division: + divisions[division] = text + text = '' + division = match.group(1) + # print("Setting division: '%s'"%division) + elif division: + if child.name == 'p': + try: + test = child['align'] + continue + except KeyError: + pass + text += extract_text(child) + if division: + divisions[division] = text + + print("Listed Divisions: %s"%divisions.keys()) + + text = '' + + # THIS IS A TERRIBLE HACK. + # Problem: the sociology page's intro course is missing a course number. + # Solution: this. + # This will break (hopefully) whenever the sociology fixes that page. + # Until then, uh... + if dept == 'socy': + divisions['Lower-Division'] = '1. '+divisions['Lower-Division'] + + for k, v in divisions.items(): + text += '\nDIVISION %s\n%s'%(k, v) + return text + +def fetch_dept_page_content (url): + try: + soup = fetch_soup(url) + content = soup.find("div", {"class": "content"}) + text = extract_sections(content, url.split('/')[-1].split('.')[0]) + enforce(text, "Empty page content: '%s'\nRaw content:\n%s", url, content.text) + text = text.replace('\\n', '') + text = '\n'.join([ line.strip() for line in text.split('\n') ]) + return text + except HTTPError: + print("Failed to open department page '%s'"%url) + return None + +class DepartmentPageEntry: + def __init__ (self, dept, title, url, content): + self.dept = dept.strip() + self.title = title.strip() + self.url = url.strip() + self.content = content + + def __repr__ (self): + return '''[Department %s title '%s' url '%s' content (%d byte(s))'''%( + self.dept, self.title, self.url, len(self.content)) + +def fetch_department_course_pages (base_url = 'https://registrar.ucsc.edu/catalog/programs-courses', dept_urls = None): + if not dept_urls: + dept_urls = fetch_department_urls(base_url) + enforce(dept_urls, "Could not fetch department urls from index at base url '%s'", base_url) + + for title, url in dept_urls.items(): + page = url.split(u'/')[-1] + dept = page.split(u'.')[0] + url = u'%s/course-descriptions/%s'%(base_url, page) + print("Fetching '%s' => '%s'"%(title, url)) + result = fetch_dept_page_content(url) + if result: + yield DepartmentPageEntry(dept, title, url, result) + +def dump_department_pages_to_disk (path='data', base_url = 'https://registrar.ucsc.edu/catalog/programs-courses', dept_urls = None): + for dept in fetch_department_course_pages(base_url, dept_urls): + with open('%s/courses/%s'%(path, dept.dept), 'w') as f: + f.write(u'\n'.join([ + dept.dept, + dept.title, + dept.url, + dept.content + ])) + +def fetch_courses_from_disk (path='data'): + for filename in os.listdir(u'%s/courses/'%path): + with open(u'%s/courses/%s'%(path, filename), 'r') 
as f: + lines = f.read().split('\n') + result = DepartmentPageEntry( + lines[0], + lines[1], + lines[2], + '\n'.join(lines[3:])) + print("Loaded %s: '%s', %s byte(s)"%( + result.dept, result.title, len(result.content))) + yield result + +def fetch_course_pages (*args, **kwargs): + courses = list(fetch_courses_from_disk(*args, **kwargs)) + if not courses: + print("No disk cache; refetching") + return fetch_department_course_pages(*args, **kwargs) + return courses + + +if __name__ == '__main__': + dump_department_pages_to_disk('data') + # dept_urls = fetch_department_urls() + # print("Got %s"%dept_urls) + # for dept in fetch_department_course_pages(): + # print(dept) + # print(dept.content) + # print() \ No newline at end of file diff --git a/crawlers/ucsc/fetch_index.py b/crawlers/ucsc/fetch_index.py new file mode 100644 index 0000000..ba1cdec --- /dev/null +++ b/crawlers/ucsc/fetch_index.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import re +from urllib.request import urlopen +from bs4 import BeautifulSoup +import gzip +import io +import unicodedata + +def read_url (url): + response = urlopen(url) + return response.read() + try: + buffer = io.StringIO(response.read()) + result = gzip.GzipFile(fileobj=buffer) + return result.read().decode('utf8') + except IOError: + return response.read()#.encode('utf8') + +def fetch_soup (url): + text = str(read_url(url)) + # text = text.replace(u'\u2014', u'–') # unicode bullshit + text = text.replace('\xa0', ' ') + text = unicodedata.normalize('NFKD', text) + with open('temp', 'w') as f: + f.write(text) + return BeautifulSoup(text, 'html.parser') + +def enforce (condition, msg, *args): + if not condition: + raise Exception(msg % args) + +def parse_department_link (a): + href = a['href'] #if 'href' in a else '' + #title = a['title'] if 'title' in a else '' + match = re.match(r'program.statements/([a-z]+\.html)', href) + enforce(match, "Unexpected link url: '%s'", href) + text = a.text.strip() + if text: + return text, href + +def parse_department_links (links): + for link in links: + result = parse_department_link(link) + if result: + yield result + +def fetch_department_urls (base_url = 'https://registrar.ucsc.edu/catalog/programs-courses'): + index_url = '%s/index.html'%base_url + soup = fetch_soup(index_url) + dept_anchor = soup.find('a', id='departments') + enforce(dept_anchor, "Could not find '%s/#departments'", index_url) + header = dept_anchor.parent + enforce(header.name == "h2", "Unexpected: is not a h2 tag (got '%s')", header.name) + table = header.findNext('tr') + enforce(table.name == "tr", "Expected element after heading to be table, not '%s'", table.name) + return {k: '%s/%s'%(base_url, v) for k, v in parse_department_links(table.find_all('a'))} + +if __name__ == '__main__': + result = fetch_department_urls() + print("Found %s department(s):"%len(result)) + for k, v in result.items(): + print("%s: %s"%(k, v)) diff --git a/crawlers/ucsc/parse_course_pages.py b/crawlers/ucsc/parse_course_pages.py new file mode 100644 index 0000000..b5d867e --- /dev/null +++ b/crawlers/ucsc/parse_course_pages.py @@ -0,0 +1,183 @@ +import re +from fetch_index import fetch_soup, enforce +from fetch_course_pages import fetch_course_pages +from prereq_parser import parse_prereqs + + +class Course: + def __init__ (self, name, title, credits, term, dept, division, description): + self.name = name + self.title = title + self.credits = credits + self.term = term + self.dept = dept + self.division = division + self.description = description + 
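+# Parsing pipeline (one registrar page at a time):
+#   parse_course_page -> parse_division -> parse_course
+# Each parse_* helper consumes a matched prefix of its input and returns
+# (remaining_text, result), so a page is peeled apart front to back.
+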
+def last_tok (s): + return s.split('\n')[0] if s[0] != '\n' else '\\n%s'%(s.split('\n')[0]) + +def parse_course_title_and_credits (s): + match = re.match(r'((?:U\.\s*S\.\s*|W\.E\.B\.\s*|C\.E\.\s*|[A-Z@"]|[234]D\s+|A (?:Li|Su)|I C)[\w/\-,:\(\)\\\'\d–!/?]+(?:(?:\.\.\.|[ \t]+)(?:Vol\.|vs\.|4\.5|Ph\.D\.|U\.S\.|(?:B\.)?C\.E\.?|C\.|A\.|c\.\s+1|[&\+\(\)\w/\-,:!\d–"\\\'!/?])+)*)(?:\s+\((\d+) credits?\))?\.*[ \t]*', s) + enforce(match, "Expected course title + credit(s), got '%s'"%last_tok(s)) + s = s[match.end():] + title = match.group(1).strip() + credits = match.group(2) + credits = int(credits) if credits else -1 + return s, title, credits + +def parse_course_term (s): + match = re.match(r'([FWS](?:,[FWS])*|\*+)\n', s) + fallback = re.match(r'\n', s) if not match else None + enforce(match or fallback, "Expected course term, got '%s'"%last_tok(s)) + if match: + return s[match.end():], match.group(1) + else: + return s[fallback.end():], None + +def parse_course_description (s): + match = re.match(r'\n*([A-Z"][^\n]+)(?:\n+|$)', s) + fallback = re.match(r'\n+', s) + enforce(match or fallback, "Expected course description, got '%s'"%last_tok(s)) + if match: + return s[match.end():], match.group(1) + else: + return s[fallback.end():], None + +def parse_instructor_from_description (s): + if not s: + return s, None + match = re.search(r'\s*((\([FWS]\)|The Staff|[A-Z]r?\.|[A-Z][a-z]+(?:.[A-Z])?|m. cardenas)(?:(?:,?\s+)(The Staff|[A-Z]r?\.|[A-Z][a-z]+(?:.[A-Z])?|\([FWS]\)|m. cardenas))*),?\s*$', s) + enforce(match, "Expected instructor at end of course description, got '%s'", s) + return match.group(1), s[:match.start()] + +# def parse_prereqs (prereqs): +# with open('prereqs', 'a') as f: +# f.write(prereqs+'\n') +# return + +# if 'enrollment' in prereqs: +# print("Enrollment restriction: '%s'"%prereqs) + +# elif ';' in prereqs: +# match = re.match(r'(.+;\s+)+(and|or)?\s+(.+)', prereqs) +# enforce(match, "Could not match ';' case in '%s'"%prereqs) +# elif ',' in prereqs: +# match = re.match(r'(.+,\s+)+(and|or)?\s+(.+)', prereqs) +# enforce(match, "Could not match ',' case in '%s'"%prereqs) + +def parse_prereqs_from_description (s, dept, dept_lookup): + if not s: + return None + match = re.search(r'Prerequisite\(?s\)?:\s+([^\.]+)', s) + if not match: + # print("No prereqs! 
in '%s'"%s) + return None + prereqs = match.group(1) + with open('prereqs', 'a') as f: + f.write(prereqs+'\n') + return parse_prereqs(prereqs, dept=dept, depts=dept_lookup) + +def parse_course (s, dept=None, division=None, dept_lookup=None): + match = re.match(r'[\n\s]*(\d+[A-Z]?)\.\s+', s) + if not match: + return s, None + s = s[match.end():] + name = '%s %s'%(dept.upper(), match.group(1)) + s, title, credits = parse_course_title_and_credits(s) + s, term = parse_course_term(s) + s, description = parse_course_description(s) + # print("Got course %s '%s', %s credit(s), %s"%(name, title, credits, term)) + # print("Description: '%s'"%description) + + # print("COURSE: %s"%name) + # print("INITIAL: %s"%description) + + instructor, description = parse_instructor_from_description(description) + prereqs = parse_prereqs_from_description(description, dept, dept_lookup) + + + # print("INSTRUCTOR: %s"%instructor) + # print("DESCRIPTION: %s"%description) + # print() + # print("=> instructor(s) '%s', description '%s'"%(instructor, description)) + return s, Course(name, title, credits, term, dept, division, description) + +def parse_division (s, dept=None, dept_lookup=None): + match = re.match(r'[\n\s]*DIVISION\s+([A-Z][a-z]+(?:\-[A-Z][a-z]+)*)\s*\n', s) + fallback = re.match(r'\* Not|<|Students submit petition to sponsoring agency\. May be repeated for credit\. The Staff|\[Return to top\]', s) if not match else None + enforce(match or fallback, "Expected 'DIVISION
\\n', not\n%s"%last_tok(s)) + if not match: + return '', [] + division = match.group(1) + s = s[match.end():] + # print("got DIVISION: '%s' in dept '%s'"%(division, dept)) + + courses = [] + while s: + s, result = parse_course(s, dept=dept, division=division, dept_lookup=dept_lookup) + if result: + courses.append(result) + else: + break + return s, courses + +def parse_course_page (page, dept_lookup): + text = page.content + courses = [] + while text: + text, result = parse_division(text, dept=page.dept, dept_lookup=dept_lookup) + # if result: + # print("Parsed %s courses from %s (%s)"%(len(result), page.dept, result[0].division)) + courses += result + return courses + +def fixup_course_lookup (lookup): + lookup['Chemistry'] = lookup['Chemistry and Biochemistry'] + +def parse_course_pages (*args, **kwargs): + pages = list(fetch_course_pages(*args, **kwargs)) + dept_lookup = {} + for page in pages: + dept_lookup[page.title] = page.dept + fixup_course_lookup(dept_lookup) + + # print("Dept lookup:") + items = sorted(list(dept_lookup.items()), key=lambda x: x[0]) + for i, (title, dept) in enumerate(items): + print("\t%d\t'%s': '%s'"%(i, title, dept)) + + for page in pages: + for result in parse_course_page(page, dept_lookup=dept_lookup): + yield result + +if __name__ == '__main__': + with open('prereqs', 'w') as f: + f.write('') + with open('unparsed', 'w') as f: + f.write('') + + courses = list(parse_course_pages()) + # print("Parsed %s courses"%len(courses)) + + byDept = {} + byDiv = {} + for course in courses: + if not course.dept in byDept: + byDept[course.dept] = [] + if not course.division in byDiv: + byDiv[course.division] = [] + byDept[course.dept].append(course) + byDiv[course.division].append(course) + + # print("Courses by department:") + # for dept, courses in byDept.items(): + # print("\t%s: %s course(s)"%(dept, len(courses))) + + # print("Courses by division:") + # for div, courses in byDiv.items(): + # print("\t%s: %s course(s)"%(div, len(courses))) + + + # print(fetch_course_pages()) + # map(parse_course_page, fetch_course_pages()) diff --git a/crawlers/ucsc/prereq_parser.py b/crawlers/ucsc/prereq_parser.py new file mode 100644 index 0000000..b38bbbe --- /dev/null +++ b/crawlers/ucsc/prereq_parser.py @@ -0,0 +1,285 @@ +import re + +def unspacify (s): + return ''.join([ w.strip() for w in s.strip().split('\n') ]) + +tokenizer = re.compile(unspacify(r''' + ([Cc]ourses?)| + (:| + \s+in\s+|\s+a\s+|\s+from\s+|\s+is\s+|\s+for\s+| + (?:[,;]?\s+(?:and\s+)?)?[Ss]atisfaction\s+of\s+(?:the\s+)?Entry\s+Level\s+Writing(?:\s+and\s+Composition)?(?:\s+[Rr]equirements?)?| + (?:pass\s+)?swimming(?:\s+ability|\s+skills\s+tests?\s+and\s+medical\s+clearance)| + (?:graduate|upper).+standing| + open\s+to\s+graduate\s+students| + undergrads.+instructor| + restricted.+students| + completion.+requirements?| + enrollment.+members?| + enrolled.+meeting| + score.+MPE\)| + score.+higher| + score.+of\s+\d+| + equivalent| + skills| + math.+background| + an\s+Undergraduate\s+Research\s+Contract.+department| + the\s+following:| + submission.+process| + proposal.+supervise| + approval.+major| + approval.+preceptor| + College.+Writing| + Completion| + satisfaction.+requirements?| + permission.+department| + intro.+classroom| + for\s+an\s+understanding.+program| + acceptance.+program| + skill.+test| + satisfaction.+requirement| + in\s+the.+Program| + (?:completing|enrollment).+instructor| + basic.+assumed| + consent.+(?:instructor|coordinator)| + is.+course| + prior.+enrollment\s+in| + highly.+preparation| + 
essay.+life| + intro.+tion| + by.+coordinator| + college.+approval|approval.+college| + suggested| + college-level.+coursework| + students.+instructor| + previous.+enrollment\s+in| + (?:is\s+restricted\s+to\s+)(?:feminist|psychology).+majors| + (?:a\s+)score.+(?:higher|MPE\))| + selection.+work| + enrollment.+interview| + high\s+school.+recommended| + basic\s+college| + in\s+ocean.+recommended| + no.+quarter| + core| + university.+biology| + operational.+language| + interview.+meeting| + must.+C\+\+| + introductory\s+statistics\s+course\s+\(.+ent\)| + \(|\)| + research.+department| + (?:or\s+)?(?:by\s+)?permission.+instructor| + interview.+project| + upper.+(?:recommended|supervise)| + sumbission.+process| + prior.+major| + placement\s+by\s+examination| + at\s+least\s+one\s+astronomy\s+course| + \(or\s+equivalent\)| + high-school\s+level\s+chemistry| + pass(?:\s+in)?Swimming\s+Level\s+I+\s+course.+skills| + (?:in\s+)freestyle.+breaststroke| + (?:by\s+)?(?:consent|permission)?(?:\s+of(?:\s+the)?\s+instructor)?| + instructor\s+determin(?:ation|es\s+skill\s+level)\s+at\s+first\s+class\s+meeting| + [Bb]asic\s+knowledge\s+of\s+computer\s+programming\s+languages\s+is\s+assumed| + basic\s+rowing| + more\s+hours\s+of\s+club\s+keelboat\s+useage| + advancement.+agency| + (?:instructor ?)determination\s+at\s+first\s+class\s+meeting| + a\s+writing.+meeting| + intended.+only| + mathematics\s+placement.+higher| + interview.+materials| + students.+agency| + pass.+skills| + interview.+preparedness| + work.+interview| + (?:a\s+)proposal.+supervise| + instructor.+permission| + open\s+only\s+Press| + instructor.+level| + certification.+clearance| + special.+instructor| + completion.+LA| + interview.+only| + excellent.+courses| + enrollment.+majors| + instructor.+required| + for.+perission(?:.+enroll)?| + or\s+.+equivalent| + enroll.+seniors| + concurrent.+enrollment| + basic.+Fortran| + calculus.+algebra| + instructor.+approval| + A\s+background.+programming| + satisfactory.+exam| + must.+(?:book|skills)| + priority.+concentration| + another\s+screenwriting\s+course| + petition.+concentration\)?| + history.+seminar| + (?:one\s+year|years\s+of).+language| + qualifications.+meeting| + equivalent\s+skills| + interview.+portfolio| + (?:(?:a.+)?placement|AWPE).+score\s+of\s+\d+| + taking.+recommended| + approval\s+of\s+the\s+Writing\s+Program| + [Pp]revious(?:\s+course\s+in\s+ocean\s+sciences)?| + Basic\s+Scuba\s+Certification| + Scuba| + in\s+Oakes| + approval.+provost| + current.+leader| + (?:a\s+)score\s+of\s+.+\(MPE\)| + (?:one|two)\s+upper-division\s+history\s+courses| + journalism\s+experience| + (?:the\s+)?equivalent| + essay.+member| + a\s+proposal.+supervise| + (?:determination|admission|audition).+meeting| + placement\s+by\s+interview| + proficiency\s+in\s+French| + participation.+ACE| + good\s+academic\s+standing| + pass.+swimming| + AP.+(?:higher|\d+)| + one.+studies| + enrollment\s+in| + is\s+required| + open.+Press| + freestyle.+breaststroke| + certification.+Program| + consent.+instructor| + Successful| + the.+Program| + satisfaction.+requirements| + one.+additional.+course| + required|experience| + must.+concurrently| + are.+recommended| + an.+department| + \s+any\s+| + of.+the.+following| + permission.+department| + Entry.+requirements| + successful.+core| + at\s+least.+cour?ses| + score.+\(MPE\)| + of| + score\s+of\s+\d+or\s+higher\s+on\s+the\s+mathematics\s+placement\s+examination\s+\(MPE\)| + (?:is\s+)?(?:are\s+)?(?:strongly\s+)?recommended(?:\s+as\s+preparation)?| + 
(?:is\s+)?[Rr]equire(?:d|ment) + (?:enrollment.+|is.+restricted.+)?(?:seniors?|upper-division|graduate(?:\s+students?))(?:.+standing)?| + higher|requirements|university.level.+biology|as.preparation|preferred|\(|\)|previous.or.concurrent.enrollment.in|ocean|[Ee]arth| + intro.+tion| + with.+adviser| + highly.+this.course| + prior.+this.course| + sub.+supervise| + work.+enroll| + to.enroll| + sciences.is.+recommended| + non.sculpture.+studios.from| + non.print.+studios.from| + non.painting.+studios.from| + non.photography.+studios.from| + from:| + per.+permission| + probability.+background| + basic.+systems| + qualifications.+inquire.+office| + or.by.permission.+instructor| + familiarity.+C\+\+| + exceptions.+instructor| + computer.+elective| + intro.CAL.+classroom| + an.understanding.+program| + grade.+better.in| + are.required| + per.+permission| + exception.+instructor| + restricted.+majors| + intro.+tion| + restricted.+seniors| + psychology.+majors| + upper.+course| + as.+course| + a.university.level.+instructor| + as.prereq.+course| + knowledge.+language| + engagement.+research| + petition.+agency| + proof.+writing| + see.+information| + admission.+audition| + strong.+recommended| + application.+letter| + folklore.+recommended| + sponsoring.+approval| + advancement.to.candidacy| + instructoazr| + for.+majors| + a.+recommended| + at.+language.+equivalent| + knowledge.+language| + instructor| + petition.+agency| + preparation| + at.+following:| + determination.+application;| + a.college.level.calculus.course| + intro.Spanish.+Examination| + )| + (\s+)| + ((?:[A-Z][a-z]+(?:\s+and)?[\s/]+)*[A-Z][a-z]+|[A-Z]+)| + (\d+[A-Z]?(?:[/-][A-Z])*)| + ([;,]\s*(?:and|or)?)| + (and|or)| + ([Oo]ne|[Tt]wo)| + (concurrent\s+enrollment\s+in)| + (required)| + (either)| + (.+) +'''), re.DOTALL | re.VERBOSE) + +assert(re.match(tokenizer, 'satisfaction of the Entry Level Writing and Composition requirements')) +assert(re.match(tokenizer, 'permission of instructor')) +assert(re.match(tokenizer, 'permission of the instructor')) + + +def parse_prereqs (prereqs, dept, depts): + # print("Parsing '%s'"%prereqs) + depts['course'] = dept + depts['courses'] = dept + course_prefix = "N/A " + for match in re.finditer(tokenizer, prereqs): + (course_keyword, + ignore, + whitespace, + course, number, + delims, + and_or, + one_from, + concurrent, + required, + either, + error + ) = match.groups() + if error: + with open ('unparsed', 'a') as f: + f.write(error+'\n') + print("unparsed: '%s'"%error) + # raise Exception("unmatched token(s) '%s' in '%s'"%(error, prereqs)) + elif course: + course = course.strip() + try: + course_prefix = '%s '%depts[course].upper() + except KeyError: + pass + # print("Unhandled course: '%s'"%course) + elif number: + pass + # print(course_prefix+number) + diff --git a/crawlers/ucsc/ucsc_registrar_crawler.py b/crawlers/ucsc/ucsc_registrar_crawler.py new file mode 100644 index 0000000..50f6f85 --- /dev/null +++ b/crawlers/ucsc/ucsc_registrar_crawler.py @@ -0,0 +1,68 @@ +from bs4 import BeautifulSoup +from urllib2 import urlopen +from pprint import pprint +import re + +def fetch_html (url, process_callback): + response = urlopen(url) + return process_callback(BeautifulSoup(response.read(), 'html.parser')) + + +def enforce (condition, msg, *args): + if not condition: + raise Exception(msg % args) + + +def process_registrar_page_content (url, callback): + def process (soup): + top = soup.find(id='top') + enforce(top, "Could not find 'top' element in page at '%s':%s", + url, soup.prettify()) + content = 
top.parent.parent + enforce('content' in content['class'], + "Expected #top to be nested within a 'content' div, not\n%s\nin page:\n%s", + content.prettify() if content else '', soup.prettify()) + return callback(content) + return fetch_html(url, process) + +def filterMapRegex (items, regex, groups = (1,)): + for item in items: + match = re.match(regex, item) + if match: + yield match.group(*groups) + +def process_registrar_course_page (dept): + dept = dept.upper() + prefix = dept + ' ' + courses = {} + def parse_course (name, text): + items = text.split('.') + courses[name] = { 'dept': dept } + if len(items) > 0: + courses[name]['title'] = items[0] + items = items[1:] + if len(items) > 0: + match = re.match(r'\s*([FWS](?:,[FWS])*|\*)\s+', items[0]) + enforce(match, "Could not match terms in '%s'", items[0]) + courses[name]['terms'] = match.group(1).replace(',','') + courses[name]['instructor'] = items[-1] + items = items[:-1] + if len(items) > 0: + courses[name]['description'] = '.'.join(items) + + def process (content): + text = content.text + text = re.sub(r'\.([\)"]+)', r'\1.', text) + items = filterMapRegex(text.split('\n'), + r'(\d+[A-Z]?)\.\s+([^\n]+)', (1, 2)) + for courseId, rest in items: + parse_course(prefix + courseId, rest) + return courses + return process + +if __name__ == '__main__': + result = process_registrar_page_content( + 'https://registrar.ucsc.edu/catalog/archive/17-18/programs-courses/course-descriptions/math.html', + process_registrar_course_page('math')) + + pprint(result) diff --git a/crawlers/ucsc_old/scrapy.cfg b/crawlers/ucsc_old/scrapy.cfg new file mode 100644 index 0000000..7bed386 --- /dev/null +++ b/crawlers/ucsc_old/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = ucsc.settings + +[deploy] +#url = http://localhost:6800/ +project = ucsc diff --git a/crawlers/ucsc_old/ucsc/__init__.py b/crawlers/ucsc_old/ucsc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crawlers/ucsc_old/ucsc/architecture.py b/crawlers/ucsc_old/ucsc/architecture.py new file mode 100644 index 0000000..a5f952c --- /dev/null +++ b/crawlers/ucsc_old/ucsc/architecture.py @@ -0,0 +1,200 @@ +import scrapy +import re +''' Crappy initial implementation. Minimum required to run / pass. Can add useful features later. 
''' + + +class BaseCrawler: + pass + +class SelectorWrapper: + def __init__ (self, value): + self.value = value + + def xpath_require_one (self, selection): + if self.value: + result = self.value.xpath(selection) + if result is None or len(result) > 1: + raise Exception("Expected single selection with '%s', got '%s', prev selection:\n%s"%( + selection, result, self.value.extract())) + return SelectorWrapper(result) + return self + + def xpath_require_many (self, selection): + if self.value: + result = self.value.xpath(selection) + if result is None: + raise Exception("Expected 1+ selection(s) with '%s', got '%s', prev selection:\n%s"%( + selection, result, self.value.extract())) + return SelectorWrapper(result) + return self + + def map_async (self, callback): + if not self.value: + callback(self) + else: + for entry in self.value: + callback(SelectorWrapper(entry)) + + def xpath_stripped_text (self, selection=None, strip=None): + if self.value: + selection = '%s/text()'%selection if selection else 'text()' + + result = self.value.xpath(selection) + result = result.extract() if result else result + if result is None:# or len(result) != 1: + raise Exception("Expected text(), in selection '%s', got '%s' in:\n%s"%( + selection, result, self.value.extract())) + return SelectorWrapper(result[0].strip(strip) if strip else result[0].strip()) + return self + + def xpath_attrib (self, selection, strip=None): + if self.value: + result = self.value.xpath(selection) + result = result.extract() if result else result + if result is None or len(result) != 1: + raise Exception("Expected attrib '%s', got '%s' in:\n%s"%( + selection, result, self.value.extract())) + return SelectorWrapper(result[0].strip(strip) if strip else result[0].strip()) + return self + + + + def bind (self, result, attrib): + if self.value: + value = self.value if type(self.value) == str or type(self.value) == unicode or type(self.value) == int \ + else self.value.extract()[0] + if type(attrib) == str or type(attrib) == unicode: + result[attrib] = self.value + elif type(attrib) == tuple: + for k in attrib: + result[k] = self.value + else: + raise Exception("Invalid argument passed to %s.bind(): %s %s"%( + type(self), type(attrib), attrib)) + else: + result[attrib] = None + print("Failed to assign attrib '%s' to %s in %s"%( + attrib, type(result[attrib]), type(result))) + + def equals (self, other): + # if (type(self.value) == str or type(self.value) == unicode) == (type(other) == str or type(other) == unicode): + # pass + # if type(self.value) != type(other): + # raise Exception("%s.equals() attempting to compare conflicting types: %s and %s"%( + # type(self), type(self.value), type(other))) + return self.value == other + + def matches_re (self, regex): + if not self.value: + raise Exception("Attempting to do regex match on null result") + + if type(self.value) == str or type(self.value) == unicode: + return re.match(regex, self.value) is not None + return self.value.re(regex) is not None + + def contains (self, other): + if type(self.value) == str or type(self.value) == unicode: + return other in self.value + return self.value.contains(other) + + def bind_re (self, regex, result, attrib): + if self.value: + try: + value = self.value.extract()[0] + except AttributeError: + value = self.value + # value = self.value if type(self.value) == str or type(self.value) == unicode or type(self.value) == int \ + # else self.value.extract()[0] + + match = re.match(regex, self.value) + if not match: + raise Exception("Failed to match regex '%s' 
against input %s"%( + match, value)) + + if type(attrib) == str or type(attrib) == unicode: + result[attrib] = match.group(1) + elif type(attrib) == tuple: + for i, k in enumerate(attrib): + result[k] = match.group(i+1) + else: + raise Exception("Invalid argument passed to %s.bind_re(): %s %s"%( + type(self), type(attrib), attrib)) + else: + result[attrib] = None + print("Failed to assign attrib '%s' to %s in %s"%( + attrib, type(result[attrib]), type(result))) + + def bind_re_map (self, regex, result, attrib, transform): + if self.value: + value = self.value if type(self.value) == str or type(self.value) == int or type(self.value) == unicode \ + else self.value.extract()[0] + + match = re.match(regex, value) + if not match: + raise Exception("Failed to match regex '%s' against input %s"%( + regex, value)) + + if type(attrib) == str: + result[attrib] = transform(match.group(1)) + elif type(attrib) == tuple: + for i, (k, f) in enumerate(zip(attrib, transform)): + result[k] = f(match.group(i+1)) + else: + raise Exception("Invalid argument passed to %s.bind_re(): %s %s"%( + type(self), type(attrib), attrib)) + else: + result[attrib] = None + print("Failed to assign attrib '%s' to %s in %s"%( + attrib, type(result[attrib]), type(result))) + + def to_int (self): + if self.value: + return SelectorWrapper(int(self.value)) + return self + + def request_async_crawl (self, crawler=None, url=None): + assert(crawler is not None and url is not None) + + + def map_sequential_cases (self, selection=None, check='maybe', cases=None): + assert(check in set(('yes', 'no', 'maybe'))) + assert(cases is not None) + assert(type(cases) == tuple) + assert(type(cases[0]) == tuple) + assert(type(cases[0][0]) == str) + + do_check = check != 'no' + if not self.value: + for req, test, applicator in cases: + applicator(self) + else: + results = self.value.xpath(selection) if selection else self.value + i = 0 + for item in results: + result = SelectorWrapper(item) + if i > len(cases): + print("Too few items to match all cases") + return + if do_check and not cases[i][1](result): + if cases[i][0] == 'required': + raise Exception("Failed map_sequential_cases case test (%d):\n%s"%( + i, result)) + else: + cases[i][2](result) + i += 1 + if i < len(cases): + print("Did not visit all items") + + +def item_producer (Item): + def decorator (fcn): + def wrapper (self, request): + result = Item() + fcn(self, request, result) + return wrapper + return decorator + +def parser_entrypoint (fcn): + def wrapper (self, request): + return fcn(self, SelectorWrapper(request)) + return wrapper diff --git a/crawlers/ucsc_old/ucsc/items.py b/crawlers/ucsc_old/ucsc/items.py new file mode 100644 index 0000000..f06eff9 --- /dev/null +++ b/crawlers/ucsc_old/ucsc/items.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy +from scrapy.item import Item, Field + +class ProgramStatementItem(scrapy.Item): # not very important + url = Field() + title = Field() + program_statement = Field() + raw_page_content = Field() + +class CourseDescriptionItem(scrapy.Item): # VERY IMPORTANT + url = Field() + dept = Field() + dept_title = Field() + course_numnber = Field() + course_title = Field() + quarters_offered = Field() + course_description = Field() + +class FacultyItem(scrapy.Item): # would be nice to have + url = Field() + name = Field() # before ',' (recommend .split(',')[0]) + title = Field() # everything after ',' 
(recommend .split(',')[1:]) + statement = Field() # optional + + +class PisaIndexItem(scrapy.Item): + """ Encapsulates all the data visible from a Pisa course listing on pisa.ucsc.edu/class_search/index.php """ + url = Field() # url of class page, eg. "https://pisa.ucsc.edu/class_search/index.php/index.php?action=detail&class_data=YToyOntzOjU6IjpTVFJNIjtzOjQ6IjIxODgiO3M6MTA6IjpDTEFTU19OQlIiO3M6NToiMjE3MjMiO30%3D" + course_name = Field() # string, eg. "AMS 03" + course_title = Field() # string, eg. "Precalculus" + course_section = Field() # string, eg. "01" + class_number = Field() # int, eg. 21723 + instructor = Field() # string, eg. "Garaud,P." + class_type = Field() # "LEC", "LAB", or "SEM" (or "DISC"...?) + location = Field() # string, eg. "Soc Sci 2 075" + meet_times = Field() # string, eg. "MWF 10:40AM-11:45AM" + enroll_max = Field() # int + enroll_current = Field() # int + materials_url = Field() # link to materials page, eg. "http://ucsc.verbacompare.com/comparison?id=FL18__AMS__003__01" + term = Field() # TBD, eg. "Fall 2018" + term_id = Field() # TBD, integer id used when searching via form + + +class PisaCourseItem(scrapy.Item): + """ Encapsulates all the data visible from a class page; TBD """ + url = Field() # url of class page, eg. "https://pisa.ucsc.edu/class_search/index.php/index.php?action=detail&class_data=YToyOntzOjU6IjpTVFJNIjtzOjQ6IjIxODgiO3M6MTA6IjpDTEFTU19OQlIiO3M6NToiMjE3MjMiO30%3D" + course_name = Field() # string, eg. "AMS 03" + course_title = Field() # string, eg. "Precalculus" + course_section = Field() # string, eg. "01" + class_number = Field() # int, eg. 21723 + lecture_number = Field() # int, class_number of lecture component (or class_number) + instructor = Field() # string, eg. "Garaud,P." + class_type = Field() # "LEC", "LAB", or "SEM" (or "DISC"...?) + class_type_pretty = Field() # "Lecture", ... + location = Field() # string, eg. "Soc Sci 2 075" + meet_times = Field() # string, eg. "MWF 10:40AM-11:45AM" + enroll_max = Field() # int + enroll_current = Field() # int + materials_url = Field() # link to materials page, eg. "http://ucsc.verbacompare.com/comparison?id=FL18__AMS__003__01" + term = Field() # eg. "Fall 2018" + term_id = Field() # integer id used when searching via form + career_type = Field() + grading_options = Field() + credits = Field() + gen_ed_categories = Field() + waitlist_max = Field() + waitlist_current = Field() + + course_description = Field() # Description text + enrollment_reqs = Field() # Enrollment text + class_notes = Field() # Class notes text + class_dates = Field() diff --git a/crawlers/ucsc_old/ucsc/middlewares.py b/crawlers/ucsc_old/ucsc/middlewares.py new file mode 100644 index 0000000..8697293 --- /dev/null +++ b/crawlers/ucsc_old/ucsc/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class UcscSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. 
+ + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class UcscDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/crawlers/ucsc_old/ucsc/pipelines.py b/crawlers/ucsc_old/ucsc/pipelines.py new file mode 100644 index 0000000..2208d67 --- /dev/null +++ b/crawlers/ucsc_old/ucsc/pipelines.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + + +class UcscPipeline(object): + def process_item(self, item, spider): + return item diff --git a/crawlers/ucsc_old/ucsc/settings.py b/crawlers/ucsc_old/ucsc/settings.py new file mode 100644 index 0000000..0a18688 --- /dev/null +++ b/crawlers/ucsc_old/ucsc/settings.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for ucsc project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'ucsc' + +SPIDER_MODULES = ['ucsc.spiders'] +NEWSPIDER_MODULE = 'ucsc.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'ucsc (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'ucsc.middlewares.UcscSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'ucsc.middlewares.UcscDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +#ITEM_PIPELINES = { +# 'ucsc.pipelines.UcscPipeline': 300, +#} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +AUTOTHROTTLE_ENABLED = True +# The initial download delay +AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +AUTOTHROTTLE_DEBUG = False +#CONCURRENT_REQUESTS_PER_IP +CONCURRENT_REQUESTS_PER_IP = 5 + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/crawlers/ucsc_old/ucsc/spiders/__init__.py b/crawlers/ucsc_old/ucsc/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/crawlers/ucsc_old/ucsc/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
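Worth a note before the spiders themselves: the AutoThrottle and concurrency values above are project-wide defaults, but Scrapy lets any one spider override them through the `custom_settings` class attribute. A minimal sketch (illustrative only; `PoliteSpider` is hypothetical and not part of this project):

```python
import scrapy

class PoliteSpider(scrapy.Spider):
    """Hypothetical spider showing per-spider overrides of settings.py."""
    name = 'polite-example'
    start_urls = ['https://example.com/']

    # custom_settings takes precedence over the project's settings.py,
    # so a single sensitive target can be crawled more gently.
    custom_settings = {
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 10,   # slower ramp-up than the project's 5
        'CONCURRENT_REQUESTS_PER_IP': 2,  # stricter than the project's 5
    }

    def parse(self, response):
        pass
```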
diff --git a/crawlers/ucsc_old/ucsc/spiders/pisa.py b/crawlers/ucsc_old/ucsc/spiders/pisa.py new file mode 100644 index 0000000..714feec --- /dev/null +++ b/crawlers/ucsc_old/ucsc/spiders/pisa.py @@ -0,0 +1,187 @@ +# -*- coding: utf-8 -*- +import scrapy +import logging +import re +from scrapy.linkextractors import LinkExtractor +from scrapy.spiders import CrawlSpider, Rule +from ucsc.items import PisaIndexItem, PisaCourseItem + +site_path = lambda path: '{}/{}'.format( + 'https://pisa.ucsc.edu/class_search', path) + +def parse_course_title (text, result): + assert(text) + match = re.match(r'\s*(\w+\s+\d+[A-Z]?)[^\d]+(\d+)[^\w]+([^\n]+)', text) + if not match: + raise Exception("Failed to parse '%s'"%text) + result['course_name'] = match.group(1) + result['course_section'] = match.group(2) + result['course_title'] = match.group(3).strip() + +class PisaSpider(scrapy.Spider): + name = 'pisa' + allowed_domains = ['pisa.ucsc.edu'] + search_url = site_path('index.php') + start_urls = [ search_url ] + + def __init__(self, *args, **kwargs): + logger = logging.getLogger('scrapy.spidermiddlewares.httperror') + logger.setLevel(logging.WARNING) + super(PisaSpider, self).__init__(*args, **kwargs) + + self.max_index_scrapes = -1 + self.max_page_scrapes = -1 + self.pages_total = 0 + self.pages_done = 0 + + def parse(self, response): + yield scrapy.FormRequest(url=self.search_url, + formdata={'action':'results', + 'binds[:term]':'2188', + 'binds[:reg_status]':'all', + 'binds[:subject]':'', + 'binds[:catalog_nbr_op]':'=''', + 'binds[:catalog_nbr]':'', + 'binds[:title]':'', + 'binds[:instr_name_op]':'=''', + 'binds[:instructor]':'', + 'binds[:ge]':'', + 'binds[:crse_units_op]':'=''', + 'binds[:crse_units_from]':'', + 'binds[:crse_units_to]':'', + 'binds[:crse_units_exact]':'', + 'binds[:days]':'', + 'binds[:times]':'', + 'binds[:acad_career]':'', + 'binds[:session_code]':'', + 'rec_start': '0', + 'rec_dur': '1582'}, + callback=self.parse_course_index) + + def parse_course_index(self, response): + if self.max_index_scrapes == 0: + return + + print("Parsing index '%s'"%response.url) + items = response.xpath('body/div[contains(@class,"center-block")]/div[@class="panel-body"]/div[contains(@id,"rowpanel")]') + assert(items) + for item in items: + if self.max_index_scrapes == 0: + return + self.max_index_scrapes -= 1 + + result = PisaIndexItem() + anchor = item.xpath('div[contains(@class,"panel-heading")]/h2/a[contains(@id,"class_id_")]') + assert(anchor) + result['url'] = site_path(anchor.xpath('@href').extract()[0]) + + # Temporarily disabled; this IS valid index data + # + # parse course name, title, section + # parse_course_title(anchor.xpath('text()').extract()[0], result) + + # grab class number + enrollment info + # rest = item.xpath('div[contains(@class,"panel-body")]/div[contains(@class,"row")]') + # assert(rest) + # result['class_number'] = int(rest.xpath('div[1]/a/text()').extract()[0]) + # result['instructor'] = rest.xpath('div[2]/text()').extract()[0].strip() + # location_info = rest.xpath('div[3]/text()') + # result['class_type'], result['location'] = location_info.re(r'\s*([A-Z]+):\s+([\s\w]+)') + + # result['meet_times'] = rest.xpath('div[4]/text()').extract()[0].strip() + # enroll_info = rest.xpath('div[5]/text()') + # result['enroll_current'], result['enroll_max'] = map(int, enroll_info.re(r'\s*(\d+)\s+of\s+(\d+)')) + # result['materials_url'] = rest.xpath('div[6]/a/@href').extract()[0] + + # yield result + # print("Sending crawl request for '%s'"%result['url']) + if self.max_page_scrapes != 0: 
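+ # note: -1 (the value set in __init__) means "unlimited" - these counters
+ # only stop the crawl when they count down to exactly 0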
+ self.max_page_scrapes -= 1 + yield scrapy.Request(result['url'], callback=self.parse_course_page) + self.pages_total += 1 + print("%d / %d"%(self.pages_done, self.pages_total)) + + + def parse_course_page(self, response): + result = PisaCourseItem() + content = response.xpath('body/div[contains(@class,"panel")]/div[contains(@class,"panel-body")]') + assert(content) + + parse_course_title(content.xpath('div[1]/div/h2/text()').extract()[0], result) + result['term'] = content.xpath('div[2]/div/text()').extract()[0].strip() + + def parse_panel_class_details (panel_body): + details = panel_body.xpath('div[contains(@class,"row")]') + left_panel, right_panel = details.xpath('div[1]/dl'), details.xpath('div[2]/dl') + result['career_type'] = left_panel.xpath('dd[1]/text()').extract()[0].strip('"') + result['grading_options'] = left_panel.xpath('dd[2]/text()').extract()[0].strip('"') + result['class_number'] = int(left_panel.xpath('dd[3]/text()').extract()[0].strip('"')) + result['lecture_number'] = result['class_number'] + class_type = left_panel.xpath('dd[4]/text()').extract()[0].strip('"') + try: + result['class_type'] = { + 'Lecture': 'LEC', + 'Discussion': 'DISC', + 'Seminar': 'SEM', + 'Laboratory': 'LAB', + 'Field Studies': 'FLD', + 'Studio': 'fixme (Studio)', + }[class_type] + except KeyError: + print("FIXME unhandled class type: '%s'"%class_type) + # raise Exception("Unhandled class_type: '%s'"%class_type) + result['credits'] = left_panel.xpath('dd[5]/text()').extract()[0].strip('"') + result['gen_ed_categories'] = left_panel.xpath('dd[5]/text()').extract()[0].strip('"') + avail_seats = int(right_panel.xpath('dd[2]/text()').extract()[0].strip('"')) + result['enroll_max'] = int(right_panel.xpath('dd[3]/text()').extract()[0].strip('"')) + result['enroll_current'] = int(right_panel.xpath('dd[4]/text()').extract()[0].strip('"')) + result['waitlist_max'] = int(right_panel.xpath('dd[5]/text()').extract()[0].strip('"')) + result['waitlist_current'] = int(right_panel.xpath('dd[6]/text()').extract()[0].strip('"')) + # assert(avail_seats == result['enroll_max'] - result['enroll_current']) + + def parse_panel_description (panel_body): + result['course_description'] = panel_body.xpath('text()').extract()[0].strip() + + def parse_panel_enrollment_reqs (panel_body): + result['enrollment_reqs'] = panel_body.xpath('text()').extract()[0].strip() + + def parse_panel_class_notes (panel_body): + result['class_notes'] = panel_body.xpath('text()').extract()[0].strip() + + def parse_panel_meeting_info (panel_body): + meet_info = panel_body.xpath('table') + meet_info = panel_body.xpath('tbody') or meet_info + meet_info = meet_info.xpath('tr[2]') + # print(meet_info.extract()) + if meet_info: + result['meet_times'] = meet_info.xpath('td[1]/text()').extract()[0].strip() + result['location'] = meet_info.xpath('td[2]/text()').extract()[0].strip() + result['instructor'] = meet_info.xpath('td[3]/text()').extract()[0].strip() + result['class_dates'] = meet_info.xpath('td[4]/text()').extract()[0].strip() + + def parse_panel_sections (panel_body): + pass + + def parse_panel_combined_sections (panel_body): + pass + + panels = content.xpath('div[contains(@class,"panel-group")]/div[contains(@class,"row")]') + for panel in panels: + header = panel.xpath('div[contains(@class,"panel-heading")]/h2/text()').extract()[0].strip() + body = panel.xpath('div[contains(@class,"panel-body")]') + try: + { + 'Class Details': parse_panel_class_details, + 'Description': parse_panel_description, + 'Enrollment Requirements': 
parse_panel_enrollment_reqs, + 'Class Notes': parse_panel_class_notes, + 'Meeting Information': parse_panel_meeting_info, + 'Combined Sections': parse_panel_combined_sections, + 'Associated Discussion Sections or Labs': parse_panel_sections, + }[header](body) + except KeyError: + raise Exception("Unhandled panel: '%s', with content:\n%s"%(header, body.extract())) + + yield result + self.pages_done += 1 + print("%d / %d"%(self.pages_done, self.pages_total))
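For reference, `parse_course_title` above expects Pisa headings shaped like `AMS 03 - 01 Precalculus`. A standalone sanity check in the same assert style the crawlers already use (the sample string is invented, not captured from pisa.ucsc.edu):

```python
import re

def parse_course_title(text, result):
    # same regex as parse_course_title in pisa.py above
    match = re.match(r'\s*(\w+\s+\d+[A-Z]?)[^\d]+(\d+)[^\w]+([^\n]+)', text)
    assert match, "Failed to parse '%s'" % text
    result['course_name'] = match.group(1)
    result['course_section'] = match.group(2)
    result['course_title'] = match.group(3).strip()

result = {}
parse_course_title('AMS 03 - 01   Precalculus', result)
assert result == {'course_name': 'AMS 03', 'course_section': '01',
                  'course_title': 'Precalculus'}
```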
diff --git a/crawlers/ucsc_old/ucsc/spiders/pisa_index_crawler.py b/crawlers/ucsc_old/ucsc/spiders/pisa_index_crawler.py new file mode 100644 index 0000000..27b3146 --- /dev/null +++ b/crawlers/ucsc_old/ucsc/spiders/pisa_index_crawler.py @@ -0,0 +1,150 @@ +''' + This is a test for an API wrapper around scrapy. + It passes when the following code can run and produce output identical to that of the current crawler (pisa.py). +''' +from ucsc.architecture import BaseCrawler, item_producer, parser_entrypoint +from ucsc.items import PisaIndexItem +import scrapy +import re +from scrapy.spiders import CrawlSpider +import datetime + +site_path = lambda path: '{}/{}'.format( + 'https://pisa.ucsc.edu/class_search', path) + +def to_datetime_time (hourly_time_string): + ''' Converts a time string generated by pisa into a python datetime.time value ''' + hours, minutes, am_pm = re.match(r'(\d+):(\d+)(AM|PM)', hourly_time_string).groups() + hours, minutes = int(hours) % 12, int(minutes) + if am_pm == 'PM': + hours += 12 + return datetime.time(hours, minutes) + +# TODO: unittests ^ + +class PisaCourseIndexCrawler (BaseCrawler): + @parser_entrypoint + def parse (self, response): + response.xpath_require_one('body') \ + .xpath_require_one('div[contains(@class,"center-block")]') \ + .xpath_require_one('div[@class="panel-body"]') \ + .xpath_require_many('div[contains(@id,"rowpanel")]') \ + .map_async(self.parse_index_item) + + @item_producer(PisaIndexItem) + def parse_index_item (self, response, result): + anchor = response \ + .xpath_require_one('div[contains(@class,"panel-heading")]') \ + .xpath_require_one('h2/a[contains(@id,"class_id_")]') + + anchor.xpath_attrib('@href').bind(result, 'url') + anchor.xpath_stripped_text().bind_re( + r'\s*(\w+\s+\d+[A-Z]?)[^\d]+(\d+)[^\w]+([^\n]+)', + result, + ('course_name', 'course_section', 'course_title')) + + content = response \ + .xpath_require_one('div[contains(@class,"panel-body")]') \ + .xpath_require_one('div[contains(@class,"row")]') + + content.xpath_require_many('div[@class="col-xs-6 col-sm-3"]') \ + .map_sequential_cases(check='maybe', cases=( + ('required', + lambda test: + test.xpath_stripped_text().equals("Class Number:") and \ + test.xpath_attrib('a/@id').matches_re(r'class_nbr_\d+') and \ + test.xpath_attrib('a/@href').matches_re( + r'https://pisa\.ucsc\.edu/class_search/index\.php\?action=detail&class_data=\w+'), + lambda value: value.xpath_stripped_text('a').to_int().bind(result, 'class_number')), + + ('required', + lambda test: + test.xpath_require_one('i[1]').xpath_attrib('@class').contains('fa-user') and \ + test.xpath_require_one('i[2]').xpath_attrib('@class').equals('sr-only') and \ + test.xpath_require_one('i[2]').xpath_stripped_text().equals('Instructor:'), + lambda value: value.xpath_stripped_text().bind(result, 'instructor')), + + ('required', + lambda test: + test.xpath_require_one('i[1]').xpath_attrib('@class').contains('fa-location-arrow') and \ + test.xpath_require_one('i[2]').xpath_attrib('@class').equals('sr-only') and \ + test.xpath_require_one('i[2]').xpath_stripped_text().equals('Day and Time:'), + lambda value: value.xpath_stripped_text().bind_re( + r'(LEC|DISC|LAB):\s+([\w\s]+)', + result, + ('class_type', 'location'))), + + ('required', + lambda test: + test.xpath_require_one('i[1]').xpath_attrib('@class').contains('fa-clock-o') and \ + test.xpath_require_one('i[2]').xpath_attrib('@class').equals('sr-only') and \ + test.xpath_require_one('i[2]').xpath_stripped_text().equals('Location:'), + lambda value: value.xpath_stripped_text().bind_re_map( + r'((?:M|Tu|W|Tr|F)+)\s+(\d+:\d+(?:AM|PM))-(\d+:\d+(?:AM|PM))', + result, + ('meet_days', 'meet_begin', 'meet_end'), + (lambda days: days.replace('Tr','R').replace('Tu','T'), to_datetime_time, to_datetime_time))), + + ('required', + lambda test: True, + # test.xpath_stripped_text().matches_re(r'\d+\s+of\s+\d+\s+Enrolled'), + lambda value: value.xpath_stripped_text().bind_re_map( + r'(\d+)\s+of\s+(\d+)\s+Enrolled', + result, + ('enroll_current', 'enroll_max'), + (int, int))) + )) + + # Simpler, but less robust version: + content.xpath_stripped_text('div[1]/a').to_int().bind(result, 'class_number') + content.xpath_stripped_text('div[2]').bind(result, 'instructor') + content.xpath_stripped_text('div[3]').bind(result, 'location') + content.xpath_stripped_text('div[4]').bind(result, 'meet_times') + content.xpath_stripped_text('div[5]').bind_re_map( + r'\s*(\d+)\s+of\s+(\d+)', + result, + ('enroll_current','enroll_max'), + (int,int) + ) + content.xpath_attrib('div[6]/a/@href').bind(result, 'materials_url') + response.request_async_crawl( + crawler=PisaCoursePageCrawler, + url=result['url']) + + +class PisaCoursePageCrawler (BaseCrawler): + def parse (self, request): + pass + + + +class pisa_index_crawler (scrapy.Spider): + name = 'pisa_index_crawler' + allowed_domains = ['pisa.ucsc.edu'] + search_url = site_path('index.php') + start_urls = [ search_url ] + + def __init__ (self, *args, **kwargs): + super(pisa_index_crawler, self).__init__(*args, **kwargs) + self.my_crawler = PisaCourseIndexCrawler() + + def parse(self, response): + yield scrapy.FormRequest(url=self.search_url, + formdata={'action':'results', + 'binds[:term]':'2188', + 'binds[:reg_status]':'all', + 'binds[:subject]':'', + 'binds[:catalog_nbr_op]':'=''', + 'binds[:catalog_nbr]':'', + 'binds[:title]':'', + 'binds[:instr_name_op]':'=''', + 'binds[:instructor]':'', + 'binds[:ge]':'', + 'binds[:crse_units_op]':'=''', + 'binds[:crse_units_from]':'', + 'binds[:crse_units_to]':'', + 'binds[:crse_units_exact]':'', + 'binds[:days]':'', + 'binds[:times]':'', + 'binds[:acad_career]':'', + 'binds[:session_code]':'', + 'rec_start': '0', + 'rec_dur': '1582'}, + callback=self.my_crawler.parse) diff --git a/crawlers/ucsc_old/ucsc/spiders/registrar_courses.py b/crawlers/ucsc_old/ucsc/spiders/registrar_courses.py new file mode 100644 index 0000000..bd80d00 --- /dev/null +++ b/crawlers/ucsc_old/ucsc/spiders/registrar_courses.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- +import scrapy +import os +from ucsc.items import FacultyItem, ProgramStatementItem, CourseDescriptionItem + + +def path_components (path): + if '://' in path: + path = path.split('://')[1] + parts = path.split('/') + while parts and parts[0] == '': + parts = parts[1:] + while parts and parts[-1] == '': + parts = parts[:-1] + return parts + +assert(path_components('') == []) +assert(path_components('/') == []) +assert(path_components('foo/') == ['foo']) +assert(path_components('/bar') == ['bar']) +assert(path_components('foo/bar') == ['foo','bar']) + +def merge_url (url, rel): + # note: blame seiji for all the issues with this code + thing = url.split('://')[0] if '://' in url else 'https' + if url and url[-1] == '/': + url = 
path_components(url) + else: + url = path_components(url)[:-1] + + for part in path_components(rel): + if part == '..': + url = url[:-1] + else: + url.append(part) + return thing + '://' + '/'.join(url) + +assert(merge_url('https://registrar.ucsc.edu/catalog/programs-courses/index.html', + '../foo/bar/../baz.html') == 'https://registrar.ucsc.edu/catalog/foo/baz.html') +assert(merge_url('', 'bar.baz') == 'https://bar.baz') +assert(merge_url('https://foo/bar/baz.html', '') == 'https://foo/bar') + +registrar_base_url = 'https://registrar.ucsc.edu/catalog/programs-courses' +base_course_description_url = 'https://registrar.ucsc.edu/catalog/programs-courses/course-descriptions' +base_faculty_url = 'https://registrar.ucsc.edu/catalog/programs-courses/faculty' +base_program_description_url = 'https://registrar.ucsc.edu/catalog/programs-courses/program-statements' + +class RegistrarCoursesSpider(scrapy.Spider): + name = 'registrar_courses' + allowed_domains = ['registrar.ucsc.edu'] + start_urls = [merge_url(registrar_base_url, 'index.html')] + + def __init__(self, *args, **kwargs): + super(RegistrarCoursesSpider, self).__init__(*args, **kwargs) + self.crawled = set() + + def parse (self, response): + print("Parsing %s"%response.url) + + if base_course_description_url in response.url: + yield self.parse_course_info(response) + elif base_faculty_url in response.url: + yield self.parse_faculty_info(response) + elif base_program_description_url in response.url: + yield self.parse_program_info(response) + + all_links = response.xpath('//a') + for link in all_links: + #print("Got link: %s"%link.extract()) + try: + href = link.xpath('@href').extract()[0] + + def is_local_url (url): + for thing in ('http:','https:','C:','www','ucsc.edu'): + if thing in url: + return False + return True + + url = merge_url(response.url, href) if is_local_url(href) else href + if url in self.crawled: + continue + #print("Got URL: %s"%url) + self.crawled.add(url) + if registrar_base_url in url: + yield { 'url': url } + yield scrapy.Request(url, self.parse) + else: + pass + #print("Skipping %s"%url) + except IndexError: + pass + + def parse_course_info (self, response): + info = CourseDescriptionItem() + info['url'] = response.url + print("Got %s"%response.url) + return info + + def parse_faculty_info (self, response): + info = FacultyItem() + info['url'] = response.url + print("Got %s"%response.url) + return info + + def parse_program_info (self, response): + info = ProgramStatementItem() + info['url'] = response.url + print("Got %s"%response.url) + return info + + + +class Unused: + def parse(self, response): + # Get links to all course pages from the registrar + page_content = response\ + .xpath('body/div[@id="wrap"]/div[@id="container"]/div[@id="content"]')\ + .xpath('div[@id="sprflt"]/div[@id="main"]/div[contains(@class,"content")]') + panel_elems = page_content.xpath('table/tbody/tr/td') + + self.depts = {} + self.crawled = set() + for panel in panel_elems: + program_statements = panel.xpath('p/a') + for a in program_statements: + # print(a.xpath('@href').extract()) + dept = a.xpath('@href').re(r'program-statements/(\w+)\.html')[0] + title = a.xpath('text()').extract()[0] + url = 'https://registrar.ucsc.edu/catalog/programs-courses/program-statements/%s.html'%dept + self.depts[dept] = title + self.crawled.add(url) + yield scrapy.Request(url, callback=self.parse_program_info) + #course_url = 'https://registrar.ucsc.edu/catalog/programs-courses/course-descriptions/%s.html'%dept + program_url = 
'https://registrar.ucsc.edu/catalog/programs-courses/program-statements/%s.html'%dept + faculty_url = 'https://registrar.ucsc.edu/catalog/programs-courses/faculty/%s.html'%dept + #yield scrapy.Request(course_url, callback=self.parse_course_info) + yield scrapy.Request(program_url, callback=self.parse_program_info) + yield scrapy.Request(faculty_url, callback=self.parse_faculty_info) + + def parse_program_info (self, response): + page_content = response\ + .xpath('body/div[@id="wrap"]/div[@id="container"]/div[@id="content"]')\ + .xpath('div[@id="sprflt"]/div[@id="main"]/div[contains(@class,"content")]') + + page_links = page_content.xpath('p[3]/a') + for a in page_links: + href, regex = a.xpath('@href'), r'\.\./([\w\-]+/\w+\.html)' + try: + page = href.re(regex)[0] + title = a.xpath('text()').extract()[0] + url = 'https://registrar.ucsc.edu/catalog/programs-courses/program-statements/%s'%page + print("\n%s: %s"%(url, title)) + except IndexError: + print("Could not match '%s' with '%s'"%(href, regex)) + content = page_content + #print("%s"%content.extract()[0]) + + def parse_course_info (self, response): + print("Got %s"%response.url) + + def parse_faculty_info (self, response): + print("Got %s"%response.url) diff --git a/crawlers/ucsc_old/ucsc/spiders/rmp_ucsc.py b/crawlers/ucsc_old/ucsc/spiders/rmp_ucsc.py new file mode 100644 index 0000000..9268646 --- /dev/null +++ b/crawlers/ucsc_old/ucsc/spiders/rmp_ucsc.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +import scrapy + + +class RmpUcscSpider(scrapy.Spider): + name = 'rmp-ucsc' + allowed_domains = ['www.ratemyprofessors.com'] + start_urls = ['http://www.ratemyprofessors.com/search.jsp?queryBy=schoolId&schoolName=University+of+California+Santa+Cruz&schoolID=1078&queryoption=TEACHER'] + + def parse(self, response): + pass diff --git a/crawlers/ucsc_old/ucsc/spiders/ucsc_registrar.py b/crawlers/ucsc_old/ucsc/spiders/ucsc_registrar.py new file mode 100644 index 0000000..4abae9e --- /dev/null +++ b/crawlers/ucsc_old/ucsc/spiders/ucsc_registrar.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +import scrapy + + +class UcscRegistrarSpider(scrapy.Spider): + name = 'ucsc-registrar' + allowed_domains = ['registrar.ucsc.edu'] + start_urls = ['https://registrar.ucsc.edu/catalog/programs-courses/'] + + def parse(self, response): + pass diff --git a/crawlers/ucsd/ucsd_crawler.py b/crawlers/ucsd/ucsd_crawler.py new file mode 100644 index 0000000..16b98fc --- /dev/null +++ b/crawlers/ucsd/ucsd_crawler.py @@ -0,0 +1,298 @@ +from bs4 import BeautifulSoup +from urllib2 import urlopen +from pprint import pprint +import re + +def fetch_html (url, process_callback): + response = urlopen(url) + return process_callback(BeautifulSoup(response.read(), 'html.parser')) + +def enforce (condition, msg, *args): + if not condition: + raise Exception(msg % args) + +def get_catalog_course_pages (base_url): + index_url = '%s/front/courses.html'%base_url + def process (soup): + courses = {} + href_regex = re.compile(r'\.\./(courses/([^\.]+)\.html)') + for a in soup.find_all('a'): + if 'href' in a.attrs and 'title' in a.attrs: + match = re.match(href_regex, a.attrs['href']) + if match: + url, dept = match.group(1, 2) + url = '%s/%s'%(base_url, url) + title = a.attrs['title'] + courses[dept] = { 'url': url, 'title': title } + return courses + return fetch_html(index_url, process) + +dept_set = set()
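+# What follows is the workhorse: get_page_courses() walks each course anchor
+# on a catalog page, splits the trailing text into title / description, peels
+# off the "Prerequisites:" clause, and boils that clause down via the
+# replace_cases table below. A clause like "MATH 20A and MATH 20B" is meant
+# to reduce to the prereq list ['MATH 20A', 'MATH 20B'].
+
+def 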
get_page_courses (dept, item, output): + dept_lower = dept.lower() + course_regex = re.compile(r'([a-z]+)(\d+[a-z]?)') + + def getSiblingTextUntilNextAnchor (a): + text = '' + x = a.next + while x and x.name != 'a': + try: + text += x + except TypeError: + pass + x = x.next + return text + + def process_course (name, title, descr): + if not name or len(name) == 0: + enforce(not title and not descr, "Empty name '%s' for '%s', '%s'", name, title, descr) + return None + hits = descr.split("Prerequisites:") + prereqs = ". ".join(hits[1:]).strip().strip('.') + descr = hits[0] + prereq_requirements = set() + def requirement (*reqs): + def sub (stuff): + # for req in reqs: + # prereq_requirements.add(req) + return '' + return sub + + def course_case_multiple_and (match): + print("AND case:") + print(match.group(1, 2, 3)) + + def course_case_single (match): + print("SINGLE CASE: '%s' '%s'"%match.group(1, 2)) + + def course_case_concatenative_or (match): + print("OR CONCATENATIVE CASE: '%s' '%s'"%match.group(1, 2)) + + def course_case_concatenative_and (match): + print("AND CONCATENATIVE CASE: '%s' '%s'"%match.group(1, 2)) + + def parse_annoying_edge_case (match): + dept, course, suffixes = match.group(1, 2, 3) + match = re.match(r'(\d+)([A-Z\-]+)', course) + enforce(match, "Course invalid - something broke...? dept = '%s', course = '%s', suffixes = '%s'", + dept, course, suffixes) + prefix, suffix = match.group(1, 2) + suffixes = suffixes.strip().split() + print("PARSED ANNOYING EDGE CASE: dept, prefix = '%s', '%s'; suffixes = '%s', %s"%( + dept, prefix, suffix, suffixes)) + + def parse_fucking_ridiculous_everything_case (match): + # print("GOT RIDICULOUS CASE: '%s' '%s'"%(match.group(1), match.group(2))) + initial_string = match.group(0) + dept, courses = match.group(1, 2) + courses = re.sub(r'(and|or|[,;\-/])', ' ', courses).strip().split() + def splitCourseNumber (course): + match = re.match(r'(\d*)([A-Z]*)', course) + enforce(match, "Invalid course number: '%s' (for dept '%s', iniital string '%s'", + course, dept, initial_string) + return match.group(1, 2) + + dept_set.add(dept) + if not re.match(r'[A-Z]{2,}', dept): + try: + dept = { + 'Calculus': 'MATH', + 'Chem': 'CHEM', + 'Chemistry': 'CHEM', + 'Cog Sci': 'COGS', + 'Cognitive Science': 'COGS', + 'Economics': 'ECON', + 'Econ': 'ECON', + 'Enrollment Special Studies Courses': 'ESSC', + 'Hum': 'HUM', + 'Math': 'MATH', + 'Math Level': 'Math Level', + 'Mathematics': 'MATH', + 'Neurology': 'NEU', + 'Neurosci': 'NEU', + 'Neurosciences': 'NEU', + 'Pharm': 'PHARM', + 'Philosophy': 'PHIL', + 'Phys': 'PHYS', + 'Physics': 'PHYS', + 'Poli Sci': 'POLI', + 'Psyc': 'PSYC', + 'Psych': 'PSYC', + 'Psychology': 'PSYC', + 'Science': '??? Science', + 'Special Studies Courses': 'SSC', + 'G': 'G ???' 
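+ # keys are the free-text department spellings that actually occur in
+ # UCSD prereq strings; values are (mostly) official subject codes.
+ # Entries still marked '???' are unresolved placeholders; any spelling
+ # not listed here is rejected by the enforce() in the KeyError handler.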
+ }[dept] + except KeyError: + enforce(False, "Unrecognized department '%s'", dept) + prevNumber = None + dept += ' ' + for course in courses: + n, a = splitCourseNumber(course) + if n: + prevNumber = n + else: + n = prevNumber + prereq_requirements.add(dept + n + a) + + + replace_cases = [ + (r'none', ''), + (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*|[A-Z]+)\s+((\d+[A-Z\-/]*)(\s+(and|or)\s+[A-Z])+)(?:\s+|$)', parse_annoying_edge_case), + # (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*|[A-Z]+)\s+(\d+[A-Z\-]*)', course_case_single), + # (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*|[A-Z]+)\s+(\d+[A-Z\-]*(?:\s+or\s+\d+[A-Z\-]*)*)', course_case_concatenative_or), + # (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*|[A-Z]+)\s+(\d+[A-Z\-]*(?:\s+and\s+\d+[A-Z\-]*)*)', course_case_concatenative_and), + (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*|[A-Z]+)\s+((?:\d+[A-Z\-/]*(?:,\s+|,?\s+(?:or|and)\s+))*\d+[A-Z\-/]*)', parse_fucking_ridiculous_everything_case), + # (r'([A-Z]\w+) ((\d\w+), )or (\d+\w+)', course_case_multiple_or), + + (r'[Ll]imited to BMS graduate students except by consent of instructor', requirement("GRADUATE_STANDING", "BMS_STUDENTS_ONLY", "INSTRUCTOR_APPROVAL")), + (r'[Ll]imited to senior undergraduates, graduate students, and medical students', requirement('GRADUATE_STANDING', 'SENIOR_STANDING', 'MEDICAL_STUDENT')), + (r'in bioengineering', requirement('BIOENGINEERING_MAJOR')), + (r'in bioengineering', requirement('SOCIOLOGY_MAJOR')), + (r'biological sciences', requirement("GRADUATE_STANDING", "BIOLOGICAL_SCIENCES_MAJOR")), + (r'standard undergraduate biology courses', requirement('BICD 1', 'BICD 2', 'BICD 3', 'BICD 4')), + (r'admission to Skaggs School of Pharmacy and Pharmaceutical Sciences or BMS Program \(major Code BS75\)', requirement('ADMITTED_SKAGGS_SCHOOL', 'BMS_STUDENT')), + (r'MAS( program| students?)?', requirement('ADMITTED_MAS_CLINICAL_RESEARCH_PROGRAM')), + + (r'completion of college writing', requirement('COMPLETED_COLLEGE_WRITING')), + (r'admission to the MAS Clinical Research Program', requirement("ADMITTED_MAS_CLINICAL_RESEARCH_PROGRAM")), + (r'admission to (the )?MFA theatre program', requirement("ADMITTED_MFA_THEATRE_PROGRAM")), + + (r'PhD', requirement('PHD_STANDING', 'GRADUATE_STANDING')), + (r'(for )?graduate( students?)?( standing| status)?( required)?', requirement('GRADUATE_STANDING')), + (r'([Uu]ndergraduates must be )?seniors?( standing)?( required)?', requirement('SENIOR_STANDING')), + (r'upper.division standing( required)?', requirement('UPPER_DIVISION_STANDING')), + (r'lower.division standing( required)?', requirement('LOWER_DIVISION_STANDING')), + (r'first.(year?)', requirement('REQUIRES_FIRST_YEAR_STUDENT')), + (r'second.(year?)', requirement('REQUIRES_SECOND_YEAR_STUDENT')), + (r'third.year', requirement('REQUIRES_THIRD_YEAR_STUDENT')), + (r'transfer standing( required)?', requirement('TRANSFER_STANDING')), + (r'for transfer students?', requirement('FOR_TRANSFER_STUDENTS')), + + (r'AuD student', requirement("AUD_MAJOR")), + (r'Economics ', requirement("ECONOMICS_MAJOR")), + (r'Rady', requirement("RADY_MAJOR")), + (r'admission to PhD program in theatre', requirement("ADMITTED_PHD_THEATRE_PROGRAM")), + (r'design students?( only)?', requirement("DESIGN_MAJOR")), + (r'psychology majors?( only)?', requirement("PSYCHOLOGY_MAJOR")), + (r'GPS student?( only)?', requirement("GPS_MAJOR")), + + (r'Sixth College (students?)?( only)?', requirement("SIXTH_COLLEGE")), + (r'Revelle College', requirement("REVELLE_COLLEGE")), + + (r'(consent of (the ))?[Dd]epartment(al)? 
(stamp|approval|chair)?( required)?', requirement('DEPARTMENT_APPROVAL')), + (r'(consent of )?[Ii]nstruct(or)?( approval)?', requirement('INSTRUCTOR_APPROVAL')), + (r'(program approval)', requirement('PROGRAM_APPROVAL')), + (r'((status or )?consent of graduate program director)', requirement('REQUIRES_GRADUATE_PROGRAM_DIRECTOR_APPROVAL')), + + (r'(by |through )?audition( required)?', requirement('REQUIRES_AUDITION')), + (r'(upper.division or graduate courses in molecular and cell biology)', requirement('UPPER_DIV_OR_GRADUATE_MCB_COURSES')), + (r'Restricted to students within the DS25 major', requirement("REQUIRES_DS25_MAJOR")), + (r'All other students will be allowed as space permits', requirement("OTHER_STUDENTS_ALLOWABLE_AS_SPACE_PERMITS")), + (r'enrollment in Science Studies Program', requirement("ENROLLED_IN_SCIENCES_STUDY_PROGRAM")), + (r'Bioengineering or Bioengineering: Biotechnology majors only', requirement("BIOENGINEERING_OR_BIOTECH_MAJORS_ONLY")), + (r'by invitation only', requirement("BY_INVITATION_ONLY")), + (r'MDE students only', requirement("MDE_STUDENTS_ONLY")), + (r'(with a )?grade of [A-Z]+.?( or better)?(, or equivalent)?',''), + (r'(or enrolled in|the department|or equivalent|(successful )?completion of)', ''), + (r'(in music)', ''), + (r'[Ee]nrollment (restricted to|by completion of prerequisites or by)', ''), + (r'\(S/U grades? (permitted|(option )?only)\.\)', ''), + (r'\([FWS](,[FWS])*\)', ''), + (r'^\s*((and|or|for|[,;\.\(\)])\s*)+$', ''), + ] + if prereqs: + original = prereqs + for r, s in replace_cases: + prereqs = re.sub(r, s, prereqs).strip() + # if prereqs: + # print(original) + # print("\t'%s'"%prereqs) + return { 'name': name, 'dept': name.split()[0], 'title': title, 'description': descr, 'prereqs': list(prereq_requirements) } + + def process (soup): + num_courses = 0 + for a in soup.find_all('a'): + try: + match = re.match(course_regex, a.attrs['id']) + if not match: + continue + text = getSiblingTextUntilNextAnchor(a).strip() + # print(text) + if '\n' in text: + items = text.split('\n') + header = items[0].strip() + descrip = items[1].strip() + # descrip = '\n'.join(items[1:]).strip() + else: + header, descrip = text.strip(), '' + # print(header) + if '.' 
in header: + items = header.split('.') + name = items[0].strip() + rest = '.'.join(items[1:]).strip() + else: + name, rest = header, '' + course = process_course(name, rest, descrip) + if course: + num_courses += 1 + output['courses'][course['name']] = course + except KeyError: + continue + print("%d / %d: Parsed '%s': %s courses"%( + item['item_index'] + 1, item['total_items'], item['url'], num_courses)) + return fetch_html(item['url'], process) + +def do_work (x): + get_page_courses(x['work_item']['dept'], x['work_item'], x) + return x['courses'] + +def fetch_ucsd_courses ( + base_url='http://ucsd.edu/catalog', + out_file=None, + parallelism=16, + return_results=True, +): + print("Fetching course pages...") + course_pages = get_catalog_course_pages(base_url) + print("Got %d pages from %s"%(len(course_pages), base_url)) + + for i, (k, x) in enumerate(course_pages.iteritems()): + course_pages[k]['item_index'] = i + course_pages[k]['total_items'] = len(course_pages) + course_pages[k]['dept'] = k + + output = { 'courses': {} } + if parallelism > 1: + from multiprocessing import Pool + pool = Pool(parallelism) + items = [ { 'courses': {}, 'work_item': item } for k, item in course_pages.iteritems() ] + courses = pool.map(do_work, items) + for result in courses: + output['courses'].update(result) + else: + for k, x in course_pages.iteritems(): + get_page_courses(k, x, output) + + if out_file: + import json + with open(out_file, 'w') as f: + json.dump(output, f) + print("Wrote %d courses to '%s'"%(len(output['courses']), out_file)) + + if return_results: + return output + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description='Fetches course data from the UCSD course catalog') + parser.add_argument('-o', '--out', type=str, help='output file', nargs='?', default='ucsd_courses.json') + parser.add_argument('-n', '--parallel', type=int, nargs='?', default=16) + args = parser.parse_args() + + fetch_ucsd_courses( + base_url = 'http://ucsd.edu/catalog', + out_file = args.out, + parallelism = args.parallel) diff --git a/crawlers/ucsd/ucsd_graph_gen.py b/crawlers/ucsd/ucsd_graph_gen.py new file mode 100644 index 0000000..793c573 --- /dev/null +++ b/crawlers/ucsd/ucsd_graph_gen.py @@ -0,0 +1,122 @@ +from ucsd_crawler import fetch_ucsd_courses +import json + +def generate_graph_data (courses, limit = -1): + edges = [] + nodes = [] + lookup_table = {} + + def insert_entity (name, info): + id = lookup_table[name] = len(nodes) + nodes.append({ + 'id': len(nodes), + 'label': name, + 'title': info['title'] if 'title' in info else '', + 'dept': info['dept'] if 'dept' in info else name.strip().split()[0], + 'description': info['description'] if 'description' in info else '', + 'edges_from': set(), + 'edges_to': set() + }) + + def lookup (name, info = {}): + if name not in lookup_table: + insert_entity(name, info) + return lookup_table[name] + + for course, info in courses.iteritems(): + if limit >= 0: + if limit == 0: + break + limit -= 1 + self = lookup(course, info) + for node in map(lookup, info['prereqs']): + edges += [{ 'from': node, 'to': self }] + nodes[self]['edges_from'].add(node) + nodes[node]['edges_to'].add(self) + + for i, _ in enumerate(nodes): + nodes[i]['edges_from'] = list(nodes[i]['edges_from']) + nodes[i]['edges_to'] = list(nodes[i]['edges_to']) + return { 'edges': edges, 'nodes': nodes } + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description='Generates vizjs graph data from the ucsd course catalog') + 
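+ # example invocation: python ucsd_graph_gen.py --rebuild --limit 500 -o ucsd_graph_data.json
+ # (--rebuild re-crawls the catalog via fetch_ucsd_courses before generating;
+ # --limit caps how many courses make it into the graph)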
parser.add_argument('-i', '--input', type=str, help='input file', nargs='?', default='ucsd_courses.json') + parser.add_argument('-o', '--out', type=str, help='output file', nargs='?', default='ucsd_graph_data.json') + parser.add_argument('-r', '--rebuild', default=False, action='store_true') + parser.add_argument('-l', '--limit', type=int, default=-1) + parser.add_argument('-n', '--parallel', type=int, nargs='?', default=16) + parser.add_argument('--indent', type=int, nargs='?', default=0) + parser.add_argument('--sort_keys', type=bool, nargs='?', default=True) + parser.add_argument('-p', '--_print', default=False, action='store_true') + args = parser.parse_args() + + if args.rebuild: + content = fetch_ucsd_courses( + out_file = args.input, + return_results = True, + parallelism = args.parallel + ) + else: + with open(args.input, 'r') as f: + content = json.loads(f.read()) + # print(len(content['courses'])) + courses = content['courses'] + + with open(args.out, 'w') as f: + graph_data = generate_graph_data(courses, limit=args.limit) + data = { + 'course_info': { + 'ucsd': { + 'courses': content['courses'], + 'vizjs': graph_data + } + } + } + # print(len(data)) + # print(len(data['nodes'])) + # print(len(data['edges'])) + # print(len(data['data'])) + if args.indent: + json.dump(data, f, indent=args.indent, sort_keys=args.sort_keys) + else: + json.dump(data, f, sort_keys=args.sort_keys) + if args._print: + if args.indent: + print(json.dumps(data, indent=args.indent, sort_keys=args.sort_keys)) + else: + print(json.dumps(data, sort_keys=args.sort_keys)) + + missing_references = {} + resolved_references = {} + for course, info in sorted(courses.iteritems(), key = lambda (k,v): k): + for name in info['prereqs']: + if name not in courses: + if name not in missing_references: + missing_references[name] = { 'count': 1, 'refby': set(), 'name': name } + else: + missing_references[name]['count'] += 1 + missing_references[name]['refby'].add(course) + else: + if name not in resolved_references: + resolved_references[name] = courses[name] + courses[name]['count'] = 1 + courses[name]['refby'] = set() + else: + resolved_references[name]['count'] += 1 + resolved_references[name]['refby'].add(course) + # print("%s resolved references"%(len(resolved_references))) + # for k, v in sorted(resolved_references.iteritems(), key = lambda (k, v): k): + # print("\t%s (%s references): %s"%(k, v['count'], ', '.join(v['refby']))) + + # print("\n%s missing references"%(len(missing_references))) + # for k, v in sorted(missing_references.iteritems(), key = lambda (k, v): k): + # print("\t%s (%s references): %s"%(k, v['count'], ', '.join(v['refby']))) + + + diff --git a/jest.config.js b/jest.config.js new file mode 100644 index 0000000..d6f14a5 --- /dev/null +++ b/jest.config.js @@ -0,0 +1,16 @@ +const TEST_REGEX = '(/__tests__/.*|(\\.|/)(test|spec))\\.(jsx?|js?|tsx?|ts?)$'; + +module.exports = { + setupFiles: ['<rootDir>/jest.setup.js'], + testRegex: TEST_REGEX, + transform: { + '^.+\\.jsx?$': 'babel-jest', + }, + testPathIgnorePatterns: [ + '<rootDir>/.next/', '<rootDir>/node_modules/', + ], + moduleFileExtensions: [ + 'ts', 'tsx', 'js', 'jsx', + ], + collectCoverage: true, +}; diff --git a/jest.setup.js b/jest.setup.js new file mode 100644 index 0000000..4887d3f --- /dev/null +++ b/jest.setup.js @@ -0,0 +1,4 @@ +const Enzyme = require('enzyme'); +const Adapter = require('enzyme-adapter-react-16'); + 
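+// The React 16 adapter is registered once here and loaded into every suite
+// via setupFiles in jest.config.js; its major version has to track the React
+// major pinned in package.json (react ^16.x <-> enzyme-adapter-react-16).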
diff --git a/jest.config.js b/jest.config.js
new file mode 100644
index 0000000..d6f14a5
--- /dev/null
+++ b/jest.config.js
@@ -0,0 +1,16 @@
+const TEST_REGEX = '(/__tests__/.*|(\\.|/)(test|spec))\\.([jt]sx?)$';
+
+module.exports = {
+  setupFiles: ['<rootDir>/jest.setup.js'],
+  testRegex: TEST_REGEX,
+  transform: {
+    '^.+\\.jsx?$': 'babel-jest',
+  },
+  testPathIgnorePatterns: [
+    '<rootDir>/.next/', '<rootDir>/node_modules/',
+  ],
+  moduleFileExtensions: [
+    'ts', 'tsx', 'js', 'jsx',
+  ],
+  collectCoverage: true,
+};
diff --git a/jest.setup.js b/jest.setup.js
new file mode 100644
index 0000000..4887d3f
--- /dev/null
+++ b/jest.setup.js
@@ -0,0 +1,4 @@
+const Enzyme = require('enzyme');
+const Adapter = require('enzyme-adapter-react-16');
+
+Enzyme.configure({adapter: new Adapter()});
diff --git a/next.config.js b/next.config.js
new file mode 100644
index 0000000..583ad48
--- /dev/null
+++ b/next.config.js
@@ -0,0 +1,10 @@
+module.exports = {
+  webpack: (config) => {
+    // Fixes npm packages that depend on the `fs` module
+    config.node = {
+      fs: 'empty',
+    };
+
+    return config;
+  },
+};
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..daab75d
--- /dev/null
+++ b/package.json
@@ -0,0 +1,67 @@
+{
+  "name": "course-graph",
+  "scripts": {
+    "dev": "node server/index.js",
+    "pretest": "eslint . --ext js --ext jsx",
+    "test": "jest",
+    "test:ci": "jest --coverage --coverageReporters=text-lcov | coveralls",
+    "build": "next build",
+    "start": "NODE_ENV=production node server/index.js",
+    "docs": "jsdoc -c conf.json -d docs -R README.md components"
+  },
+  "dependencies": {
+    "@material-ui/core": "^1.4.0",
+    "@material-ui/icons": "^2.0.0",
+    "algoliasearch": "^3.29.0",
+    "bcrypt-nodejs": "0.0.3",
+    "body-parser": "latest",
+    "compression": "^1.7.2",
+    "connect-mongo": "^2.0.1",
+    "cors": "^2.8.4",
+    "crypto": "^1.0.1",
+    "express": "^4.16.3",
+    "express-flash": "0.0.2",
+    "express-session": "^1.15.6",
+    "express-validator": "^5.2.0",
+    "isomorphic-unfetch": "^2.0.0",
+    "jss": "^9.8.7",
+    "lru-cache": "^4.1.3",
+    "mongoose": "^5.2.1",
+    "next": "latest",
+    "nprogress": "^0.2.0",
+    "passport": "^0.4.0",
+    "passport-local": "^1.0.0",
+    "prop-types": "^15.6.2",
+    "qs": "^6.5.2",
+    "react": "^16.4.1",
+    "react-dom": "^16.4.1",
+    "react-draggable": "^3.0.5",
+    "react-graph-vis": "^1.0.2",
+    "react-instantsearch": "^5.2.2",
+    "react-jss": "^8.6.1",
+    "react-particles-js": "^2.2.0",
+    "reactjs-popup": "^1.1.1",
+    "styled-jsx": "^2.2.7"
+  },
+  "devDependencies": {
+    "@pixi/jsdoc-template": "^2.4.2",
+    "babel-core": "7.0.0-bridge.0",
+    "babel-eslint": "^8.2.5",
+    "babel-jest": "^23.2.0",
+    "coveralls": "^3.0.2",
+    "enzyme": "^3.3.0",
+    "enzyme-adapter-react-16": "^1.1.1",
+    "eslint": "^5.0.1",
+    "eslint-config-fbjs": "^2.0.1",
+    "eslint-plugin-babel": "^5.1.0",
+    "eslint-plugin-flowtype": "^2.49.3",
+    "eslint-plugin-jsx-a11y": "^6.1.0",
+    "eslint-plugin-react": "^7.10.0",
+    "eslint-plugin-relay": "0.0.24",
+    "jest": "^23.3.0",
+    "jsdoc": "^3.5.5",
+    "morgan": "^1.9.0",
+    "react-addons-test-utils": "^15.6.2",
+    "react-test-renderer": "^16.4.1"
+  }
+}
diff --git a/pages/_app.jsx b/pages/_app.jsx
new file mode 100644
index 0000000..9ba9fa0
--- /dev/null
+++ b/pages/_app.jsx
@@ -0,0 +1,50 @@
+import React from 'react';
+import App, { Container } from 'next/app';
+import { MuiThemeProvider } from '@material-ui/core/styles';
+import CssBaseline from '@material-ui/core/CssBaseline';
+import JssProvider from 'react-jss/lib/JssProvider';
+import getPageContext from '../utils/getPageContext';
+
+class MyApp extends App {
+  constructor(props) {
+    super(props);
+    this.pageContext = getPageContext();
+  }
+
+  pageContext = null;
+
+  componentDidMount() {
+    // Remove the server-side injected CSS.
+    const jssStyles = document.querySelector('#jss-server-side');
+    if (jssStyles && jssStyles.parentNode) {
+      jssStyles.parentNode.removeChild(jssStyles);
+    }
+  }
+
+  render() {
+    const {Component, pageProps} = this.props;
+    return (
+      <Container>
+        {/* Wrap every page in Jss and Theme providers */}
+        <JssProvider
+          registry={this.pageContext.sheetsRegistry}
+          generateClassName={this.pageContext.generateClassName}
+        >
+          {/* MuiThemeProvider makes the theme available down the React
+              tree thanks to React context. */}
+          <MuiThemeProvider
+            theme={this.pageContext.theme}
+            sheetsManager={this.pageContext.sheetsManager}
+          >
+            <CssBaseline />
+            {/* Pass pageContext to the _document through the renderPage enhancer
+                to render collected styles on server side. */}
+            <Component pageContext={this.pageContext} {...pageProps} />
+          </MuiThemeProvider>
+        </JssProvider>
+      </Container>
+    );
+  }
+}
+
+export default MyApp;
diff --git a/pages/_document.jsx b/pages/_document.jsx
new file mode 100644
index 0000000..2beb9ca
--- /dev/null
+++ b/pages/_document.jsx
@@ -0,0 +1,94 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+import Document, { Head, Main, NextScript } from 'next/document';
+import flush from 'styled-jsx/server';
+
+class MyDocument extends Document {
+  render() {
+    const {pageContext} = this.props;
+
+    return (
+      <html lang="en" dir="ltr">
+        <Head>
+          <title>Course Graph</title>
+          <meta charSet="utf-8" />
+          {/* Use minimum-scale=1 to enable GPU rasterization */}
+          <meta
+            name="viewport"
+            content="minimum-scale=1, initial-scale=1, width=device-width, shrink-to-fit=no"
+          />
+          {/* PWA primary color */}
+          <meta
+            name="theme-color"
+            content={pageContext.theme.palette.primary.main}
+          />
+          <link rel="shortcut icon" href="/static/favicon.ico" />
+        </Head>
+        <body>
+          <Main />
+          <NextScript />
+        </body>
+      </html>
+    );
+  }
+}
+
+MyDocument.getInitialProps = ctx => {
+  // Resolution order
+  //
+  // On the server:
+  // 1. app.getInitialProps
+  // 2. page.getInitialProps
+  // 3. document.getInitialProps
+  // 4. app.render
+  // 5. page.render
+  // 6. document.render
+  //
+  // On the server with error:
+  // 1. document.getInitialProps
+  // 2. app.render
+  // 3. page.render
+  // 4. document.render
+  //
+  // On the client
+  // 1. app.getInitialProps
+  // 2. page.getInitialProps
+  // 3. app.render
+  // 4. page.render
+
+  let pageContext;
+  const page = ctx.renderPage(Component => {
+    const WrappedComponent = props => {
+      pageContext = props.pageContext;
+      return <Component {...props} />;
+    };
+
+    WrappedComponent.propTypes = {
+      pageContext: PropTypes.object.isRequired,
+    };
+
+    return WrappedComponent;
+  });
+
+  return {
+    ...page,
+    pageContext,
+    // Styles fragment is rendered after the app and page rendering finish.
+    styles: (
+      <React.Fragment>
+        <style
+          id="jss-server-side"
+          dangerouslySetInnerHTML={{
+            __html: pageContext.sheetsRegistry.toString(),
+          }}
+        />
+        {flush() || null}
+      </React.Fragment>
+    ),
+  };
+};
+
+export default MyDocument;
+    );
+  }
+}
+
+export default withStyles(styles)(IndexPage);
diff --git a/pages/ucsc/index.jsx b/pages/ucsc/index.jsx
new file mode 100644
index 0000000..2ad8be8
--- /dev/null
+++ b/pages/ucsc/index.jsx
@@ -0,0 +1,80 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+import Router from 'next/router';
+import qs from 'qs';
+
+import App from '../../components/Search';
+import { findResultsState } from '../../components/Instantsearch';
+import Header from '../../components/Header';
+
+/**
+ * Debounce window (ms) before the URL is updated with new search state.
+ * @type {number}
+ */
+const updateAfter = 700;
+
+/**
+ * @param searchState
+ * @return {string} the url
+ */
+const searchStateToUrl = searchState =>
+  searchState ? `${window.location.pathname}?${qs.stringify(searchState)}` : '';
+
+/**
+ * Search Page
+ */
+export default class extends React.Component {
+  static propTypes = {
+    resultsState: PropTypes.object,
+    searchState: PropTypes.object,
+  };
+
+  // A class-property arrow function is already bound to the instance, so the
+  // previous constructor call to this.onSearchStateChange.bind(this) was
+  // redundant and has been dropped.
+  onSearchStateChange = (searchState) => {
+    clearTimeout(this.debouncedSetState);
+    this.debouncedSetState = setTimeout(() => {
+      const href = searchStateToUrl(searchState);
+      Router.push(href, href, {
+        shallow: true,
+      });
+    }, updateAfter);
+    this.setState({searchState});
+  };
+
+  static async getInitialProps(params) {
+    const searchState = qs.parse(
+      params.asPath.substring(params.asPath.indexOf('?') + 1)
+    );
+    const resultsState = await findResultsState(App, {searchState});
+
+    return {resultsState, searchState};
+  }
+
+  render() {
+    return (
+      <div>
+        <Header />
+        <h1>Search</h1>
+        <App
+          resultsState={this.props.resultsState}
+          onSearchStateChange={this.onSearchStateChange}
+          searchState={this.props.searchState}
+        />
+      </div>
+    );
+  }
+}
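
The search page above serializes InstantSearch state into the query string with qs before shallow-routing. A minimal round-trip sketch of that serialization (plain Node; the state shape is illustrative):

    const qs = require('qs');
    const searchState = {query: 'CSE 100', page: 2};
    const url = `/ucsc?${qs.stringify(searchState)}`; // '/ucsc?query=CSE%20100&page=2'
    // Note qs.parse returns string values ('2', not 2), which the
    // getInitialProps above passes straight back into InstantSearch.
    console.log(qs.parse(url.split('?')[1])); // { query: 'CSE 100', page: '2' }
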
diff --git a/pages/ucsd/index.jsx b/pages/ucsd/index.jsx
new file mode 100644
index 0000000..17ad8ce
--- /dev/null
+++ b/pages/ucsd/index.jsx
@@ -0,0 +1,67 @@
+import React from 'react';
+import PropTypes from 'prop-types';
+import fetch from 'isomorphic-unfetch';
+
+import { withStyles } from '@material-ui/core/styles';
+
+import GraphViewAssembly from '../../components/graph/GraphViewAssembly';
+import Header from '../../components/Header';
+
+/**
+ * Define the style of components on this page
+ * @param theme
+ * @return {object}
+ */
+const styles = theme => ({
+  wrapper: {
+    'text-align': 'center',
+  },
+  appBar: {
+    position: 'absolute',
+    transition: theme.transitions.create(['margin', 'width'], {
+      easing: theme.transitions.easing.sharp,
+      duration: theme.transitions.duration.leavingScreen,
+    }),
+  },
+});
+
+class GraphPage extends React.Component {
+  static propTypes = {
+    classes: PropTypes.object.isRequired,
+    graphData: PropTypes.object.isRequired,
+  };
+
+  /**
+   * On the server the graph data arrives through the route's query object;
+   * on the client it is fetched from the public API instead.
+   * @param req
+   * @param query
+   * @return {Promise<*>}
+   */
+  static async getInitialProps({req, query}) {
+    const isServer = !!req;
+
+    if (isServer) {
+      return {graphData: query.itemData};
+    }
+
+    // During local development this can point at
+    // http://localhost:8080/api/graph-data/ucsd instead.
+    const res = await fetch('https://coursegraph.org/api/graph-data/ucsd');
+    const json = await res.json();
+    return {graphData: json};
+  }
+
+  render() {
+    const {classes, graphData} = this.props;
+
+    return (
+      <div className={classes.wrapper}>
+        <Header />
+        <GraphViewAssembly graphData={graphData} />
+      </div>
+    );
+  }
+}
+
+export default withStyles(styles)(GraphPage);
diff --git a/server/config/passport.js b/server/config/passport.js
new file mode 100644
index 0000000..e7e002f
--- /dev/null
+++ b/server/config/passport.js
@@ -0,0 +1,52 @@
+const passport = require('passport');
+const {Strategy: LocalStrategy} = require('passport-local');
+
+const User = require('../models/user');
+
+passport.serializeUser((user, done) => {
+  done(null, user.id);
+});
+
+passport.deserializeUser((id, done) => {
+  User.findById(id, (err, user) => {
+    done(err, user);
+  });
+});
+
+/**
+ * Sign in using Email and Password.
+ */
+passport.use(new LocalStrategy({usernameField: 'email'},
+  (email, password, done) => {
+    User.findOne({email: email.toLowerCase()}, (err, user) => {
+      if (err) {
+        return done(err);
+      }
+
+      if (!user) {
+        return done(null, false, {msg: `Email ${email} not found.`});
+      }
+
+      return user.comparePassword(password, (err, isMatch) => {
+        if (err) {
+          return done(err);
+        }
+        if (isMatch) {
+          return done(null, user);
+        }
+        return done(null, false, {msg: 'Invalid email or password.'});
+      });
+    });
+  }));
+
+/**
+ * Login Required middleware.
+ */
+exports.isAuthenticated = (req, res, next) => {
+  if (req.isAuthenticated()) {
+    return next();
+  }
+
+  return res.redirect('/account/login');
+};
diff --git a/server/controllers/courses.js b/server/controllers/courses.js
new file mode 100644
index 0000000..be52281
--- /dev/null
+++ b/server/controllers/courses.js
@@ -0,0 +1,33 @@
+const UCSC = require('../models/ucsc_course');
+const UCSD = require('../models/ucsd_courses');
+
+/**
+ * @type {Map<string, mongoose.Model>}
+ */
+let schoolMap = new Map([
+  ['UCSC', UCSC],
+  ['UCSD', UCSD],
+]);
+
+/**
+ * GET /api/courses/:id
+ */
+exports.getCourses = (req, res) => {
+  const schoolName = req.params.id || 'UCSD';
+  const school = schoolMap.get(schoolName.toUpperCase());
+
+  if (school) {
+    school.find({}).lean().exec((err, course) => {
+      if (err) {
+        return console.error(err);
+      }
+      return res.json(course);
+    });
+  } else {
+    return res.json([]);
+  }
+};
diff --git a/server/controllers/home.js b/server/controllers/home.js
new file mode 100644
index 0000000..40a8eee
--- /dev/null
+++ b/server/controllers/home.js
@@ -0,0 +1,11 @@
+/**
+ * GET /
+ * Home page.
+ * @param app {App}
+ * @return {Function}
+ */
+exports.index = (app) => (req, res) => {
+  // TODO: check whether the user is already logged in.
+  res.redirect('/');
+};
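
For reference, a hypothetical client-side call that exercises the LocalStrategy above (the field names match usernameField: 'email'; the endpoint and JSON body parser are wired up in server/index.js below):

    fetch('/account/login', {
      method: 'POST',
      headers: {'Content-Type': 'application/json'},
      // Illustrative credentials only.
      body: JSON.stringify({email: 'student@example.edu', password: 'correct-horse'}),
    }).then((res) => console.log(res.status, res.redirected));
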
diff --git a/server/controllers/user.js b/server/controllers/user.js
new file mode 100644
index 0000000..3174653
--- /dev/null
+++ b/server/controllers/user.js
@@ -0,0 +1,150 @@
+const {promisify} = require('util');
+const crypto = require('crypto');
+const passport = require('passport');
+
+const User = require('../models/user');
+
+const randomBytesAsync = promisify(crypto.randomBytes);
+
+/**
+ * GET /login
+ * Login page.
+ * @param app {App} Next app
+ * @return {Function}
+ */
+exports.getLogin = (app) => (req, res) => {
+  if (req.user) {
+    return res.redirect('/');
+  }
+
+  return app.render(req, res, '/account/login', {
+    title: 'Login',
+  });
+};
+
+/**
+ * POST /login
+ * Sign in using email and password.
+ * @param app {App} Next app
+ * @return {Function}
+ */
+exports.postLogin = (app) => (req, res, next) => {
+  req.assert('email', 'Email is not valid').isEmail();
+  req.assert('password', 'Password cannot be blank').notEmpty();
+  req.sanitize('email').normalizeEmail({gmail_remove_dots: false});
+
+  const errors = req.validationErrors();
+
+  if (errors) {
+    // req.flash('errors', errors);
+    return res.redirect('/account/error'); // login
+  }
+
+  // passport.authenticate returns a middleware; it must be invoked with
+  // (req, res, next) or authentication silently never runs.
+  return passport.authenticate('local', (err, user, info) => {
+    if (err) {
+      return next(err);
+    }
+
+    if (!user) {
+      // req.flash('errors', info);
+      return res.redirect('/account/error');
+    }
+
+    return req.logIn(user, (err) => {
+      if (err) {
+        return next(err);
+      }
+
+      // req.flash('success', {msg: 'Success! You are logged in.'});
+      return res.redirect(req.session.returnTo || '/');
+    });
+  })(req, res, next);
+};
+
+/**
+ * GET /signup
+ * Signup page.
+ * @param app
+ * @return {Function}
+ */
+exports.getSignup = (app) => (req, res) => {
+  if (req.user) {
+    return res.redirect('/');
+  }
+
+  return app.render(req, res, '/account/signup', {
+    title: 'Create Account',
+  });
+};
+
+/**
+ * POST /signup
+ * Create a new local account.
+ * @param app
+ * @return {Function}
+ */
+exports.postSignup = (app) => (req, res, next) => {
+  req.assert('email', 'Email is not valid').isEmail();
+  req.assert('password', 'Password must be at least 4 characters long').len(4);
+  req.assert('confirmPassword', 'Passwords do not match').equals(
+    req.body.password);
+  req.sanitize('email').normalizeEmail({gmail_remove_dots: false});
+
+  const errors = req.validationErrors();
+
+  if (errors) {
+    // req.flash('errors', errors);
+    return res.redirect('/account/error');
+  }
+
+  /**
+   * @type {Model}
+   */
+  const user = new User({
+    email: req.body.email,
+    password: req.body.password,
+  });
+
+  // Check if the user already exists
+  return User.findOne({email: req.body.email}, (err, existingUser) => {
+    if (err) {
+      return next(err);
+    }
+
+    if (existingUser) {
+      req.flash('errors', {
+        msg: 'Account with that email address already exists.',
+      });
+      return res.redirect('/account/error');
+    }
+
+    return user.save((err) => {
+      if (err) {
+        return next(err);
+      }
+
+      return req.logIn(user, (err) => {
+        if (err) {
+          return next(err);
+        }
+
+        return res.redirect('/');
+      });
+    });
+  });
+};
diff --git a/server/index.js b/server/index.js
new file mode 100644
index 0000000..bfcba7b
--- /dev/null
+++ b/server/index.js
@@ -0,0 +1,192 @@
+/**
+ * Module dependencies
+ */
+const express = require('express');
+const next = require('next');
+const compression = require('compression');
+const session = require('express-session');
+const bodyParser = require('body-parser');
+const mongoose = require('mongoose');
+const passport = require('passport');
+const expressValidator = require('express-validator');
+const LRUCache = require('lru-cache');
+const logger = require('morgan');
+const flash = require('express-flash');
+const MongoStore = require('connect-mongo')(session);
+const cors = require('cors');
+
+/**
+ * Controllers
+ */
+// const homeController = require('./controllers/home');
+const courseController = require('./controllers/courses');
+const userController = require('./controllers/user');
+const api = require('./operations/get_graph_data');
+
+/**
+ * Constant Settings
+ */
+const PORT = parseInt(process.env.PORT, 10) || 8080;
+const dev = process.env.NODE_ENV !== 'production';
+
+const app = next({dev});
+const defaultRequestHandler = app.getRequestHandler();
+
+const LOCAL_DB = 'courses';
+const MONGODB_URI = process.env.MONGODB_URI || `mongodb://localhost:27017/${LOCAL_DB}`;
+
+/**
+ * API keys and Passport configuration.
+ */
+const passportConfig = require('./config/passport');
+
+/**
+ * This is where we cache our rendered HTML pages.
+ * @type {LRUCache}
+ */
+const ssrCache = new LRUCache({
+  max: 100,
+  maxAge: 1000 * 60 * 60, // 1 hour
+});
+
+app.prepare()
+  .then(() => {
+    /**
+     * Create Express server.
+     */
+    const server = express();
+
+    /**
+     * Express configuration.
+     */
+    server.use(bodyParser.json());
+    // Accept classic HTML form posts (login/signup) as well as JSON bodies.
+    server.use(bodyParser.urlencoded({extended: true}));
+    server.use(expressValidator());
+    server.use(compression());
+    server.use(logger('dev'));
+    server.use(session({
+      resave: true,
+      saveUninitialized: true,
+      secret: process.env.SESSION_SECRET || 'I LOVE CMPS115',
+      cookie: {maxAge: 1209600000}, // two weeks in milliseconds
+      store: new MongoStore({
+        url: MONGODB_URI,
+        autoReconnect: true,
+      }),
+    }));
+    server.use(passport.initialize());
+    // Without passport.session(), serializeUser/deserializeUser never run and
+    // req.user stays undefined across requests.
+    server.use(passport.session());
+    server.use(flash());
+    server.use(cors());
+
+    /**
+     * Connect to MongoDB.
+     */
+    mongoose.Promise = Promise;
+    mongoose.connect(MONGODB_URI, {useNewUrlParser: true});
+
+    const db = mongoose.connection;
+
+    db.on('error', console.error.bind(console, 'connection error:'));
+    server.use((req, res, next) => {
+      // Expose the MongoDB database handle so Next.js can access it.
+      req.db = db;
+      next();
+    });
+
+    /**
+     * Primary app routes.
+     */
+    server.get('/', (req, res) => {
+      renderAndCache(req, res, '/');
+    });
+
+    server.get('/account/login', userController.getLogin(app));
+    server.post('/account/login', userController.postLogin(app));
+    server.get('/account/signup', userController.getSignup(app));
+    server.post('/account/signup', userController.postSignup(app));
+
+    server.get('/foo', passportConfig.isAuthenticated, (req, res) => {
+      res.send('hello world');
+    });
+
+    server.get('/ucsc', (req, res) => {
+      renderAndCache(req, res, '/ucsc');
+    });
+
+    server.get('/ucsd', (req, res) => {
+      const itemData = api.getGraphData();
+      // renderAndCache(req, res, '/ucsd', {itemData: itemData});
+      app.render(req, res, '/ucsd', {itemData: itemData});
+    });
+
+    /**
+     * API routes.
+     */
+    server.get('/api/courses/:id', courseController.getCourses);
+    server.get('/api/graph-data/:school', (req, res) => {
+      const itemData = api.getGraphData(req.params.school);
+      res.json(itemData);
+    });
+
+    /**
+     * Fall back on other Next.js assets.
+     */
+    server.get('*', (req, res) => {
+      return defaultRequestHandler(req, res);
+    });
+    /**
+     * Start Express server.
+     */
+    server.listen(PORT, (err) => {
+      if (err) {
+        throw err;
+      }
+      console.log(`> Ready on http://localhost:${PORT}`);
+    });
+  }).catch(error => console.error(error.stack));
+
+
+/**
+ * @param req
+ * @return {string}
+ */
+function getCacheKey(req) {
+  return `${req.url}`;
+}
+
+/**
+ * @param req
+ * @param res
+ * @param pagePath
+ * @param queryParams
+ * @return {Promise}
+ */
+async function renderAndCache(req, res, pagePath, queryParams) {
+  const key = getCacheKey(req);
+
+  // If we have a page in the cache, let's serve it
+  if (ssrCache.has(key)) {
+    res.setHeader('x-cache', 'HIT');
+    res.send(ssrCache.get(key));
+    return;
+  }
+
+  try {
+    // If not, let's render the page into HTML
+    const html = await app.renderToHTML(req, res, pagePath, queryParams);
+
+    // Something is wrong with the request, let's skip the cache
+    if (res.statusCode !== 200) {
+      res.send(html);
+      return;
+    }
+
+    // Let's cache this page
+    ssrCache.set(key, html);
+
+    res.setHeader('x-cache', 'MISS');
+    res.send(html);
+  } catch (err) {
+    app.renderError(err, req, res, pagePath, queryParams);
+  }
+}
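
The SSR page cache in renderAndCache above relies on standard lru-cache v4 semantics (the version pinned in package.json). A small standalone sketch of the behavior it depends on:

    const LRUCache = require('lru-cache');
    const cache = new LRUCache({max: 2, maxAge: 1000}); // tiny limits for demonstration
    cache.set('/a', '<html>a</html>');
    cache.set('/b', '<html>b</html>');
    cache.set('/c', '<html>c</html>'); // exceeds max: the least recently used key ('/a') is evicted
    console.log(cache.has('/a')); // false
    console.log(cache.get('/b')); // '<html>b</html>'
    // Entries also expire maxAge milliseconds after insertion, so stale
    // rendered pages age out of the cache on their own.
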
diff --git a/server/models/ucsc_course.js b/server/models/ucsc_course.js
new file mode 100644
index 0000000..0d468af
--- /dev/null
+++ b/server/models/ucsc_course.js
@@ -0,0 +1,17 @@
+const mongoose = require('mongoose');
+
+const UCSCCourseSchema = new mongoose.Schema({
+  description: {type: String},
+  division: {type: String},
+  geCategories: {type: String},
+  instructor: {type: String},
+  name: {type: String},
+  terms: {type: String},
+  title: {type: String},
+}, {
+  collection: 'ucsc',
+});
+
+const Course = mongoose.model('UCSCCourse', UCSCCourseSchema);
+
+module.exports = Course;
diff --git a/server/models/ucsd_courses.js b/server/models/ucsd_courses.js
new file mode 100644
index 0000000..9e4ecf9
--- /dev/null
+++ b/server/models/ucsd_courses.js
@@ -0,0 +1,15 @@
+const mongoose = require('mongoose');
+
+const UCSDCourseSchema = new mongoose.Schema({
+  dept: String,
+  description: String,
+  name: String,
+  prereqs: [String],
+  title: String,
+}, {
+  collection: 'ucsd',
+});
+
+const Course = mongoose.model('UCSDCourse', UCSDCourseSchema);
+
+module.exports = Course;
diff --git a/server/models/user.js b/server/models/user.js
new file mode 100644
index 0000000..291c127
--- /dev/null
+++ b/server/models/user.js
@@ -0,0 +1,49 @@
+const bcrypt = require('bcrypt-nodejs');
+const mongoose = require('mongoose');
+
+const userSchema = new mongoose.Schema({
+  email: {type: String, unique: true},
+  password: String,
+
+  profile: {
+    courses: Array,
+  },
+}, {timestamps: true});
+
+/**
+ * Password hash middleware.
+ * Must be a regular function: an arrow function would not bind `this`
+ * to the document being saved.
+ */
+userSchema.pre('save', function save(next) {
+  const user = this;
+  if (!user.isModified('password')) {
+    return next();
+  }
+
+  return bcrypt.genSalt(10, (err, salt) => {
+    if (err) {
+      return next(err);
+    }
+
+    return bcrypt.hash(user.password, salt, null, (err, hash) => {
+      if (err) {
+        return next(err);
+      }
+
+      user.password = hash;
+      return next();
+    });
+  });
+});
+
+/**
+ * Helper method for validating user's password.
+ * Also a regular function, so `this.password` refers to the document.
+ */
+userSchema.methods.comparePassword = function comparePassword(candidatePassword, cb) {
+  bcrypt.compare(candidatePassword, this.password, (err, isMatch) => {
+    cb(err, isMatch);
+  });
+};
+
+const User = mongoose.model('User', userSchema);
+
+module.exports = User;
diff --git a/server/operations/count_graph_data.js b/server/operations/count_graph_data.js
new file mode 100644
index 0000000..e50c935
--- /dev/null
+++ b/server/operations/count_graph_data.js
@@ -0,0 +1,33 @@
+const fs = require('fs');
+
+// Sample node shape:
+// {
+//   "dept": "SOCD",
+//   "description": "",
+//   "edges_from": [],
+//   "edges_to": [74],
+//   "id": 75,
+//   "label": "SOCD 158",
+//   "title": ""
+// }
+
+let data = JSON.parse(fs.readFileSync('../../data/ucsd_graph_data.json', 'utf8'));
+
+let deptSet = new Set();
+
+for (const obj of data.nodes) {
+  if (obj.dept) {
+    deptSet.add(obj.dept);
+  }
+}
+
+// Emit the departments as a JSON array literal (no trailing comma).
+const text = `[\n${[...deptSet].map((dept) => `"${dept}"`).join(',\n')}\n]`;
+
+console.log(`Found ${deptSet.size} departments`);
+console.log(text);
diff --git a/server/operations/gen_actual_data.js b/server/operations/gen_actual_data.js
new file mode 100644
index 0000000..c088f46
--- /dev/null
+++ b/server/operations/gen_actual_data.js
@@ -0,0 +1,59 @@
+const fs = require('fs');
+const mongoose = require('mongoose');
+
+const UCSC = require('../models/ucsc_course');
+const UCSD = require('../models/ucsd_courses');
+
+/**
+ * Unwrap the nested crawler payload: course_info.ucsd.courses.
+ * @param data
+ * @return {Array<object>}
+ */
+function parse_ucsd_courses(data) {
+  return Object.values(data.course_info.ucsd.courses);
+}
+
+function gen() {
+  mongoose.connect('mongodb://localhost:27017/courses', {useNewUrlParser: true});
+  let db = mongoose.connection;
+
+  db.on('error', console.error.bind(console, 'connection error:'));
+  db.once('open', () => {
+    // ucsc
+    let count = 0;
+    let data = JSON.parse(fs.readFileSync('../../data/courses.json', 'utf8'));
+
+    data.forEach((obj) => {
+      let thing = new UCSC(obj);
+      count++;
+      thing.save();
+    });
+
+    console.log(`Saved ${count}`);
+
+    // ucsd
+    count = 0;
+    data = JSON.parse(fs.readFileSync('../../data/ucsd_all_data.json', 'utf8'));
+    data = parse_ucsd_courses(data);
+
+    data.forEach((obj) => {
+      let thing = new UCSD(obj);
+      count++;
+      thing.save();
+    });
+
+    console.log(`Saved ${count}`);
+  });
+}
+
+gen();
diff --git a/server/operations/get_course_db.js b/server/operations/get_course_db.js
new file mode 100644
index 0000000..f5892ca
--- /dev/null
+++ b/server/operations/get_course_db.js
@@ -0,0 +1,22 @@
+const Course = require('../models/ucsc_course');
+
+/**
+ * @return {Promise<Array<Object>>} resolves with all UCSC courses
+ */
+function get() {
+  // exec() without a callback returns a promise. Returning a plain array
+  // populated inside the callback (as before) would always yield an empty
+  // array, because the query resolves asynchronously.
+  return Course.find({}).lean().exec();
+}
+
+module.exports = get;
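
Since get_course_db.js now returns the exec() promise directly, a caller would await it; a hypothetical sketch (not part of the diff):

    const getCourses = require('./get_course_db');
    getCourses()
      .then((courses) => console.log(`Loaded ${courses.length} UCSC courses`))
      .catch(console.error);
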
diff --git a/server/operations/get_graph_data.js b/server/operations/get_graph_data.js
new file mode 100644
index 0000000..b60f1ef
--- /dev/null
+++ b/server/operations/get_graph_data.js
@@ -0,0 +1,28 @@
+const fs = require('fs');
+const path = require('path');
+
+const data = fs.readFileSync(
+  path.join(__dirname, '../../data/ucsd_graph_data.json'), 'utf8');
+
+/**
+ * @type {Map<string, string>} school name to raw JSON payload
+ */
+const schoolMap = new Map([
+  ['UCSD', data],
+]);
+
+/**
+ * @param school {string}
+ * @return {object}
+ */
+function getGraphData(school = 'UCSD') {
+  const graphData = schoolMap.get(school.toUpperCase());
+
+  if (!graphData) {
+    return {};
+  }
+
+  return JSON.parse(graphData);
+}
+
+module.exports = {getGraphData};
diff --git a/server/operations/parse_graph_data.js b/server/operations/parse_graph_data.js
new file mode 100644
index 0000000..6859542
--- /dev/null
+++ b/server/operations/parse_graph_data.js
@@ -0,0 +1,20 @@
+const fs = require('fs');
+
+// Extract only the vis.js graph payload from the combined crawler output.
+const data = JSON.parse(
+  fs.readFileSync('../../data/ucsd_all_data.json', 'utf8'));
+
+const vizjs = data.course_info.ucsd.vizjs;
+
+fs.writeFile('../../data/ucsd_graph_data.json', JSON.stringify(vizjs), (err) => {
+  if (err) {
+    throw err;
+  }
+  console.log('The file has been saved!');
+});
diff --git a/server/operations/parse_raw_data.js b/server/operations/parse_raw_data.js
new file mode 100644
index 0000000..2b0eeb7
--- /dev/null
+++ b/server/operations/parse_raw_data.js
@@ -0,0 +1,67 @@
+const fs = require('fs');
+
+const data = JSON.parse(fs.readFileSync('../../data/data.json', 'utf8'));
+
+const courseTemplate = Object.keys({
+  description: '',
+  division: '',
+  geCategories: '',
+  instructor: '',
+  name: '',
+  terms: '',
+  title: '',
+});
+
+let arr = [];
+
+/**
+ * A record is valid when every template field is present and a string.
+ * @param obj
+ * @return {boolean}
+ */
+function check(obj) {
+  // forEach swallows return values, so the original early return could
+  // never signal failure; every() actually short-circuits.
+  return courseTemplate.every((key) =>
+    obj.hasOwnProperty(key) && typeof obj[key] === 'string');
+}
+
+function parseCourse(data) {
+  let courses = Object.values(data);
+  for (const obj of courses) {
+    if (check(obj)) { // was `if (check)`, which is always truthy
+      arr.push(obj);
+    } else {
+      console.log('Something wrong with the data: ');
+      console.log(obj);
+    }
+  }
+}
+
+let count = 0;
+for (const pair of Object.entries(data)) {
+  let key = pair[0];
+  let courses = pair[1].courses;
+
+  count++;
+  if (courses) {
+    console.log(`${key}: ${Object.keys(courses).length}`);
+
+    parseCourse(courses);
+  } else {
+    console.log(`${key}`);
+  }
+}
+console.log(count);
+
+fs.writeFile('../../data/courses.json', JSON.stringify(arr), (err) => {
+  if (err) {
+    throw err;
+  }
+  console.log('The file has been saved!');
+});
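
A hypothetical smoke test for the graph-data accessor defined above (run from server/operations/; assumes the vizjs payload exposes a nodes array, as count_graph_data.js does):

    const {getGraphData} = require('./get_graph_data');
    const graph = getGraphData('ucsd');            // lookup is case-insensitive
    console.log(Object.keys(getGraphData('mit'))); // [] — unknown schools fall back to {}
    console.log(Array.isArray(graph.nodes) ? graph.nodes.length : 'no nodes');

Parsing the cached JSON string on every call trades a little CPU per request for isolation: each caller gets a fresh object, so route handlers cannot mutate the shared payload.
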
diff --git a/static/favicon.ico b/static/favicon.ico
new file mode 100644
index 0000000..3e264cd
Binary files /dev/null and b/static/favicon.ico differ
diff --git a/static/instantsearch.css b/static/instantsearch.css
new file mode 100644
index 0000000..fa77f2f
--- /dev/null
+++ b/static/instantsearch.css
@@ -0,0 +1,53 @@
+.ais-InstantSearch__root {
+  align-items: center;
+}
+
+header {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+}
+
+content {
+  display: flex;
+  margin: 25px 0;
+}
+
+menu {
+  flex: 2;
+}
+
+footer {
+  text-align: center;
+}
+
+.ais-Pagination {
+  margin-bottom: 25px;
+}
+
+results {
+  flex: 10;
+}
+
+.hit {
+  display: flex;
+  align-items: center;
+}
+
+.hit-actions {
+  display: flex;
+}
+
+.hit-content {
+  padding: 0px 10px;
+}
+
+.hit-picture img {
+  width: 80px;
+  height: 80px;
+}
+
+.hit-type {
+  color: #888888;
+  font-size: 13px;
+}
diff --git a/static/nprogress.css b/static/nprogress.css
new file mode 100644
index 0000000..29f9d1f
--- /dev/null
+++ b/static/nprogress.css
@@ -0,0 +1,82 @@
+/* Make clicks pass-through */
+#nprogress {
+  pointer-events: none;
+}
+
+#nprogress .bar {
+  background: #29d;
+
+  position: fixed;
+  z-index: 1031;
+  top: 0;
+  left: 0;
+
+  width: 100%;
+  height: 2px;
+}
+
+/* Fancy blur effect */
+#nprogress .peg {
+  display: block;
+  position: absolute;
+  right: 0px;
+  width: 100px;
+  height: 100%;
+  box-shadow: 0 0 10px #29d, 0 0 5px #29d;
+  opacity: 1.0;
+
+  -webkit-transform: rotate(3deg) translate(0px, -4px);
+  -ms-transform: rotate(3deg) translate(0px, -4px);
+  transform: rotate(3deg) translate(0px, -4px);
+}
+
+/* Remove these to get rid of the spinner */
+#nprogress .spinner {
+  display: block;
+  position: fixed;
+  z-index: 1031;
+  top: 15px;
+  right: 15px;
+}
+
+#nprogress .spinner-icon {
+  width: 18px;
+  height: 18px;
+  box-sizing: border-box;
+
+  border: solid 2px transparent;
+  border-top-color: #29d;
+  border-left-color: #29d;
+  border-radius: 50%;
+
+  -webkit-animation: nprogress-spinner 400ms linear infinite;
+  animation: nprogress-spinner 400ms linear infinite;
+}
+
+.nprogress-custom-parent {
+  overflow: hidden;
+  position: relative;
+}
+
+.nprogress-custom-parent #nprogress .spinner,
+.nprogress-custom-parent #nprogress .bar {
+  position: absolute;
+}
+
+@-webkit-keyframes nprogress-spinner {
+  0% {
+    -webkit-transform: rotate(0deg);
+  }
+  100% {
+    -webkit-transform: rotate(360deg);
+  }
+}
+
+@keyframes nprogress-spinner {
+  0% {
+    transform: rotate(0deg);
+  }
+  100% {
+    transform: rotate(360deg);
+  }
+}
diff --git a/utils/getPageContext.js b/utils/getPageContext.js
new file mode 100644
index 0000000..0852635
--- /dev/null
+++ b/utils/getPageContext.js
@@ -0,0 +1,53 @@
+/* eslint-disable no-underscore-dangle */
+
+import { SheetsRegistry } from 'jss';
+import {
+  createGenerateClassName,
+  createMuiTheme,
+} from '@material-ui/core/styles';
+import purple from '@material-ui/core/colors/purple';
+import green from '@material-ui/core/colors/green';
+
+// A theme with custom primary and secondary color.
+// It's optional.
+const theme = createMuiTheme({
+  palette: {
+    primary: {
+      light: purple[300],
+      main: purple[500],
+      dark: purple[700],
+    },
+    secondary: {
+      light: green[300],
+      main: green[500],
+      dark: green[700],
+    },
+  },
+});
+
+function createPageContext() {
+  return {
+    theme,
+    // This is needed in order to deduplicate the injection of CSS in the page.
+    sheetsManager: new Map(),
+    // This is needed in order to inject the critical CSS.
+    sheetsRegistry: new SheetsRegistry(),
+    // The standard class name generator.
+    generateClassName: createGenerateClassName(),
+  };
+}
+
+export default function getPageContext() {
+  // Make sure to create a new context for every server-side request so that
+  // data isn't shared between connections (which would be bad).
+  if (!process.browser) {
+    return createPageContext();
+  }
+
+  // Reuse context on the client-side.
+  if (!global.__INIT_MATERIAL_UI__) {
+    global.__INIT_MATERIAL_UI__ = createPageContext();
+  }
+
+  return global.__INIT_MATERIAL_UI__;
+}