-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcrawler.js
108 lines (80 loc) · 2.53 KB
/
crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
// Crawls Google Finance starting from `initialPage`, parses every fetched
// page with the project's stock parser, and follows same-site links.

// `colors` is loaded for the string styling used below (e.g. 'ERROR'.red);
// the binding itself is never referenced.
const colors = require('colors');
const fs = require('fs');
const Stock = require('./stockParser');
const url = require('url');
const Crawler = require('crawler');

// A link is followed only when its hostname and pathname match these exactly.
const hostOnly = 'www.google.com';
const pathOnly = '/finance';
const initialPage = 'https://www.google.com/finance';

// Every URL queued so far. Seeded with the start page so that a link
// pointing back at it is not crawled (and recorded) a second time.
const URLS = [initialPage];
// One parsed record per page that came back with a usable body.
const StockData = [];

const c = new Crawler({
  maxConnections: 2,
  userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36',

  /**
   * Called once for each crawled page.
   *
   * @param {*} error - Truthy when the request failed; the page is skipped.
   * @param {Object} result - Response; `body` and `options.uri` are read here.
   * @param {Function} $ - Selector function bound to the fetched document.
   */
  callback: function (error, result, $) {
    if (error) {
      console.log('ERROR'.red, error);
      return;
    }
    if (!result.body || !$) {
      console.log(' response issue'.red);
      return;
    }

    // Parse the page and persist everything captured so far. The whole
    // array is rewritten on each page, so an interrupted crawl still
    // leaves the results gathered up to that point on disk.
    const stock = Stock($);
    StockData.push(stock);
    writeResults(StockData);
    console.log('\n', result.options.uri, JSON.stringify(stock, null, 2));

    // Collect the links on this page that are worth crawling next.
    const $a = $('a');
    const validLinks = [];
    $a.each(function (index, a) {
      const href = a.href;
      // Skip anchors without a usable href and javascript: pseudo-links.
      if (typeof href !== 'string' || href === '' || href.indexOf('javascript:') !== -1) {
        return;
      }
      // Already queued or visited.
      if (URLS.indexOf(href) !== -1) {
        return;
      }
      const reqUrl = url.parse(href);
      const isRightDomain = reqUrl.hostname === hostOnly && reqUrl.pathname === pathOnly;
      if (isRightDomain) {
        // Record immediately so a repeat of this link later on the same
        // page is treated as already queued.
        URLS.push(href);
        validLinks.push(href);
      }
    });

    c.queue(validLinks);
    console.log(' ', validLinks.length, '/', $a.length, 'valid links on this page');
    console.log(' ', 'total queued or visited:', URLS.length);
    console.log(' ', 'captured stocks:', StockData.length);
  },
});

// Kick off the crawl from the start page, using the default callback above.
c.queue(initialPage);
/**
 * Persist the collected results as pretty-printed JSON.
 *
 * Writes synchronously to `output/stocks.json` next to this script,
 * replacing any previous contents. The `output` directory is created first
 * when it is missing, so a fresh checkout does not fail with ENOENT on the
 * first write.
 *
 * @param {*} obj - Any JSON-serializable value to store.
 */
function writeResults(obj) {
  const outputDir = __dirname + '/output';
  if (!fs.existsSync(outputDir)) {
    fs.mkdirSync(outputDir);
  }
  fs.writeFileSync(outputDir + '/stocks.json', JSON.stringify(obj, null, 2));
}
// Queue a list of URLs
// c.queue(["http://jamendo.com/","http://tedxparis.com"]);
// Queue URLs with custom callbacks & parameters
// c.queue([{
// uri: "http://parishackers.org/",
// jQuery: false,
// // The global callback won't be called
// callback: function(error,result) {
// console.log("Grabbed",result.body.length,"bytes");
// }
// }]);
// Queue some HTML code directly without grabbing (mostly for tests)
// c.queue([{
// "html":"<p>This is a <strong>test</strong></p>"
// }]);