-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCrawler.js
More file actions
107 lines (100 loc) · 2.64 KB
/
Crawler.js
File metadata and controls
107 lines (100 loc) · 2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
let fileHandler = require("./fileHandler");
let rp = require("request-promise");
let WikiPage = require("./WikiPage");
// Not referenced anywhere in the visible part of this file.
// NOTE(review): presumably meant to cap how many links get followed — confirm or remove.
const LINK_CAP = 15;
// Origin that Crawler.scrape prepends to a page's path to build the request URL.
const BASE_URL = "https://en.wikipedia.org";
/**
 * Fetches a set of starting pages, extracts the internal `/wiki/` links from
 * their HTML, follows those links level by level and passes every fetched
 * page to `fileHandler.savePage`.
 */
class Crawler {
  /**
   * @param {Array|Set} pages - Starting pages (anything exposing `forEach`).
   *   Each page is read for `title` and `path` and must provide
   *   `setRootDir(dir)` and `setContent(html)`; `setContent` is presumably what
   *   populates the `html` property read by `extractInternalLinks` — verify
   *   against WikiPage.
   */
  constructor(pages) {
    this.pages = pages;
    // Links already scheduled for fetching; shared by all starting pages so
    // the same link is never requested twice by this crawler instance.
    this.visited = new Set();
  }

  /**
   * Processes every starting page: creates its directories, scrapes it,
   * crawls its links and finally saves the page itself.
   *
   * A failure in any step for one page is logged with `console.log` and skips
   * the remaining steps for that page only; other pages are unaffected.
   *
   * @returns {Promise<void>} Resolves once every starting page has either
   *   finished or failed. It does not reject for per-page failures.
   */
  crawl() {
    const jobs = [];
    this.pages.forEach((page) => {
      page.root = page.title;
      const job = fileHandler.createDirs(page)
        .then((dir) => {
          page.setRootDir(dir);
          return this.scrape(page);
        })
        .then(() => this.crawlLinks(page))
        .then(() => fileHandler.savePage(page))
        .catch((err) => {
          console.log(err);
        });
      jobs.push(job);
    });
    return Promise.all(jobs).then(() => undefined);
  }

  /**
   * Downloads `BASE_URL + page.path`, stores the response on the page via
   * `page.setContent` and fills `page.links` from the downloaded HTML.
   *
   * @param {object} page - Page to fetch; must expose `path` and `setContent`.
   * @returns {Promise<void>} Rejects if the request or the processing of the
   *   response throws.
   */
  scrape(page) {
    // Starting from a resolved promise turns a synchronous throw (for example
    // a missing page) into a rejection, as the promise executor used to.
    return Promise.resolve()
      .then(() => rp(BASE_URL + page.path))
      .then((htmlString) => {
        page.setContent(htmlString);
        this.extractInternalLinks(page);
      });
  }

  /**
   * Collects the distinct `/wiki/...` href values found in `page.html` and
   * stores them as a Set of strings on `page.links`. Hrefs whose value
   * contains a colon (e.g. `/wiki/File:...`) are skipped. A page without any
   * matching link gets an empty Set.
   *
   * @param {object} page - Page whose `html` string is scanned.
   */
  extractInternalLinks(page) {
    // The lookahead only inspects the href value itself, so a colon later on
    // the same line (such as an external "https://" link) cannot hide a link.
    // A fresh literal per call keeps the stateful `lastIndex` of /g local.
    const linkPattern = /href="(\/wiki\/(?![^"]*:)[^"]*)"/gi;
    const found = new Set();
    let match = linkPattern.exec(page.html);
    while (match !== null) {
      found.add(match[1]);
      match = linkPattern.exec(page.html);
    }
    page.links = found;
  }

  /**
   * Follows the links of `root` breadth-first. Level one fetches the links
   * currently in `root.links`; each further level fetches the links that the
   * previous level discovered. Every fetched page is saved through
   * `fileHandler.savePage` before its level counts as finished.
   *
   * `root.links` is extended in place with every link discovered on the way
   * and is the value the returned promise resolves with.
   *
   * NOTE(review): all pages of a level are requested concurrently and nothing
   * in this class limits how many links are followed per page.
   *
   * @param {object} root - Already scraped page; read for `links`, `title`
   *   and `rootDir`.
   * @param {number} [depth=2] - Number of link levels to follow.
   * @returns {Promise<Set<string>>} Resolves with `root.links`; rejects as
   *   soon as fetching or saving any followed page fails.
   */
  crawlLinks(root, depth = 2) {
    const links = root.links;
    if (links == null) {
      // Nothing to follow; mirror the value the caller handed in.
      return Promise.resolve(links);
    }

    // Fetches one linked page, saves it, and resolves with the page so the
    // caller can merge its links.
    const fetchAndSave = (link) => {
      const page = new WikiPage(link);
      page.root = root.title;
      page.setRootDir(root.rootDir);
      return this.scrape(page)
        .then(() => fileHandler.savePage(page))
        .then(() => page);
    };

    // Crawls one level, then recurses only after that level has completed so
    // the next frontier really contains the freshly discovered links.
    const crawlLevel = (level) => {
      if (level >= depth) {
        return links;
      }
      const frontier = [];
      links.forEach((link) => {
        if (!this.visited.has(link)) {
          this.visited.add(link);
          frontier.push(link);
        }
      });
      if (frontier.length === 0) {
        return links;
      }
      return Promise.all(frontier.map((link) => fetchAndSave(link)))
        .then((pages) => {
          pages.forEach((page) => {
            page.links.forEach((link) => links.add(link));
          });
          return crawlLevel(level + 1);
        });
    };

    return Promise.resolve().then(() => crawlLevel(0));
  }
}
module.exports = Crawler;