-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.js
89 lines (79 loc) · 3.21 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
// Dictionary of scraped results: president name -> date-of-death text
var presidents = {};

/**
 * GET /scrape
 * Starts the crawl at George Washington's Wikipedia page and answers
 * immediately; the crawl itself continues asynchronously inside
 * make_request and reports its progress on the terminal.
 */
app.get('/scrape', function(req, res) {
    // The OG President. Declared with `var` so it stays local to the handler
    // instead of becoming an implicit global.
    var page = "https://en.wikipedia.org/wiki/George_Washington";
    make_request(page);
    res.send('Check Terminal for output');
});
/**
 * Fetch one president's Wikipedia page, record the death date found on it,
 * and continue with the page linked from the infobox "Succeeded by" row.
 * When the infobox marks the president as "Incumbent", the collected data is
 * written out via create_output() and the crawl stops.
 *
 * Side effects: adds one entry to the module-level `presidents` dictionary,
 * calls display_progress() before following a successor link, and logs to
 * stderr when the crawl cannot continue.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 */
function make_request(url) {
    // The callback receives an error, the response and the page html
    request(url, function(error, response, html) {
        // Report a failed request instead of silently ending the crawl
        if (error) {
            console.error("Request for " + url + " failed:", error);
            return;
        }
        if (response && response.statusCode !== 200) {
            console.error("Request for " + url + " returned status " + response.statusCode);
            return;
        }

        // cheerio gives jQuery-style access to the returned html
        var $ = cheerio.load(html);
        var infobox = $('table.infobox.vcard');

        // All of these are locals so that nothing leaks into the global scope
        var name = infobox.first().find('span.fn').text();
        var date = $('span.dday.deathdate').first().text();
        // store president and date
        presidents[name] = date;

        // iterate through rows of the infobox table
        var rows = infobox.find("tr");
        for (var i = 0; i < rows.length; i++) {
            var heading = $(rows[i]).children('th').text();

            // if last president, write all to file
            if (heading.indexOf("President of the United States") > -1 &&
                    $(rows[i + 1]).children('td').text().indexOf("Incumbent") > -1) {
                create_output();
                return; // last president has been found
            }

            // otherwise, navigate to next president
            if (heading === "Succeeded by") {
                var next_pres_href = $(rows[i]).children('td').first().children('a').first().attr('href');
                // Without a link we would request ".../undefined"; stop instead
                if (!next_pres_href) {
                    console.error("No successor link found on " + url);
                    return;
                }
                display_progress();
                make_request("https://en.wikipedia.org" + next_pres_href);
                return; // first instance is what we're looking for
            }
        }

        // Neither an incumbent marker nor a successor row: the chain is broken
        console.error("No successor or incumbent marker found on " + url);
    });
}
/**
 * Write " working" followed by a cycling ellipsis to stdout, then a carriage
 * return so that the next call overwrites the same terminal line.
 *
 * The call count is kept on the function object itself
 * (display_progress.counter); the number of dots is that count modulo 4.
 * The dots are padded with spaces to a constant width so that a shorter
 * ellipsis fully overwrites the longer one printed by the previous call.
 */
function display_progress() {
    var MAX_DOTS = 3;
    if (typeof display_progress.counter === 'undefined') {
        display_progress.counter = 0;
    }
    display_progress.counter++;

    var dots = display_progress.counter % (MAX_DOTS + 1);
    var ellipsis = "";
    for (var i = 0; i < MAX_DOTS; i++) {
        // a dot for each active position, a space to blank out the rest
        ellipsis += i < dots ? "." : " ";
    }

    // "\r" moves the cursor back to the beginning of the line
    process.stdout.write(" working" + ellipsis + "\r");
}
/**
 * Serialize the module-level `presidents` dictionary as JSON with 4-space
 * indentation and write it to presideaths.json.
 * On success a confirmation is logged and the process exits; on failure the
 * error is logged and the process keeps running.
 */
function create_output() {
    var serialized = JSON.stringify(presidents, null, 4);

    // Invoked by fs.writeFile once the write has finished or failed
    var on_written = function(err) {
        if (!err) {
            console.log("Data saved in presideaths.json");
            process.exit();
            return;
        }
        return console.log(err);
    };

    fs.writeFile("presideaths.json", serialized, on_written);
}
// Start listening and tell the operator which port to open
var server_port = '8081';
app.listen(server_port);
console.log('Open your port 8081');

// Export the Express app, keeping `exports` and `module.exports` in sync
module.exports = app;
exports = module.exports;