forked from ruipgil/scraperjs
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWikimediaScraper.js
More file actions
109 lines (98 loc) · 2.33 KB
/
WikimediaScraper.js
File metadata and controls
109 lines (98 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/**
* DISCLAIMER
* This is a relatively simple example, to illustrate some of the
* possible functionalities and how to achieve them.
* There is no guarantee that this example will provide useful
* results.
* Use this example with care and at your own responsibility.
*
* In this example we run through a list of links; those that have a
* route defined are scraped for their title, language and
* first paragraph.
*
* To run:
* 'node WikimediaScraper.js link1 [... linkN]'
*/
var sjs = require('../../src/Scraper'),
async = require('../../node_modules/async'),
parseUrl = require('url').parse,
urls = process.argv.slice(2);
if(!urls || !urls.length) {
console.log("Usage: node WikimediaScraper.js url [...url]");
return;
}
// Accumulators filled while the URLs are being routed:
//  - gatheredInformation: results returned by routes that matched
//  - unknownRoutes: URLs that fell through to the router's fallback
var gatheredInformation = [],
  unknownRoutes = [];
// Router used to dispatch each URL to the route registered for it.
// NOTE(review): `firstMatch: true` presumably stops routing at the first
// route that matches a URL — confirm against the scraperjs Router docs.
var router = new sjs.Router({ firstMatch: true });
// Route for Wikipedia article URLs; the pattern declares the `:lang` and
// `:article` segments, and `lang` is read back from `utils.params` below.
router
  .on('https?://:lang.wikipedia.org/wiki/:article')
  .get()
  .createStatic()
  // Any status code other than OK (200) stops the scraper.
  .onStatusCode(function abortUnlessOk(code, utils) {
    if (code === 200) {
      return;
    }
    utils.stop();
  })
  // Take the text of the first heading and of the first paragraph.
  .scrape(function extractSummary($) {
    var heading = $('h1').first().text();
    var firstParagraph = $('p').first().text();
    return { title: heading, text: firstParagraph };
  })
  // Tag the scraped result with the language taken from the route params.
  .then(function addLanguage(result, utils) {
    result.lang = utils.params.lang;
    return result;
  });
// Same functionality as the Wikipedia route above, but built as a
// standalone scraper so that the routes below can share it.
var scraperForWiki = sjs.StaticScraper
  .create()
  // Any status code other than OK (200) stops the scraper.
  .onStatusCode(function haltOnFailure(code, utils) {
    if (code === 200) {
      return;
    }
    utils.stop();
  })
  // Take the text of the first heading and of the first paragraph.
  .scrape(function readHeadingAndIntro($) {
    var heading = $('h1').first().text();
    var intro = $('p').first().text();
    return { title: heading, text: intro };
  })
  // Routes without params (e.g. ones matched by a function) get "?" as
  // their language.
  .then(function tagLanguage(result, utils) {
    result.lang = utils.params ? utils.params.lang : "?";
    return result;
  });
// Route selected by a predicate instead of a URL pattern: it accepts any
// URL whose hostname is the English Wikiquote. Wikiquote is served from
// wikiquote.org (like the other Wikimedia sites routed in this file), so
// that is the domain compared against. `hostname` is used rather than
// `host` so that an explicit port in the URL does not defeat the match.
router
  .on(function(url) {
    return parseUrl(url).hostname === 'en.wikiquote.org';
  })
  .use(scraperForWiki);
// Wikinews pages are handled by the shared scraper as well; the pattern
// declares `:lang` and `:place` segments.
router.on('https?://:lang.wikinews.org/wiki/:place').use(scraperForWiki);
// Fallback handler: remember every URL handed to it in `unknownRoutes`.
router.otherwise(function rememberUnmatched(unmatchedUrl) {
  unknownRoutes.push(unmatchedUrl);
});
// Route every URL from the command line, at most two at a time, and
// print what was gathered once all of them have been processed.
async.eachLimit(urls, 2, function(url, done) {
  router.route(url, function(found, returned) {
    // Keep only results from URLs that matched a route and produced data.
    if (found && returned) {
      gatheredInformation.push(returned);
    }
    done();
  });
}, function(err) {
  if (err) {
    // Report the failure instead of ending the run without any output.
    console.error("Scraping failed:", err);
    return;
  }
  gatheredInformation.forEach(function(item) {
    console.log(item.title.toUpperCase()+" ("+item.lang+")");
    console.log("\t"+item.text);
  });
  // URLs that reached the fallback handler are listed on stderr, so the
  // results on stdout stay unchanged while skipped links remain visible.
  if (unknownRoutes.length > 0) {
    console.error("No route defined for:");
    unknownRoutes.forEach(function(unknownUrl) {
      console.error("\t"+unknownUrl);
    });
  }
});