-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathController.java
More file actions
51 lines (44 loc) · 1.88 KB
/
Controller.java
File metadata and controls
51 lines (44 loc) · 1.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
/**
 * Command-line entry point that configures a crawler4j crawl, seeds it with
 * the Jamendo home page and runs it with {@code CrawlerDemo} as the crawler
 * class.
 *
 * @author Akshay
 */
public class Controller {

    /** Directory handed to crawler4j as its crawl storage folder. */
    private static final String STORAGE_DIR =
            "C:\\My files\\Information Retrieval\\Crawler Data\\CrawlerDataTempJamendo";

    /** The single seed URL registered before the crawl is started. */
    private static final String SEED_URL = "https://www.jamendo.com/";

    /** Number of crawlers requested when the crawl is started. */
    private static final int CRAWLER_COUNT = 7;

    /** Value passed to {@code setMaxDepthOfCrawling}. */
    private static final int MAX_DEPTH = 0;

    /** Value passed to {@code setMaxPagesToFetch}. */
    private static final int MAX_PAGES = 50;

    /** Politeness delay in milliseconds: at most one request per second. */
    private static final int POLITENESS_DELAY_MS = 1000;

    /**
     * Builds the crawl configuration, wires up the fetcher, robots.txt
     * handling and controller, registers the seed URL and runs the crawl.
     *
     * @param args command-line arguments; not read by this method
     * @throws Exception whatever the crawler4j set-up or crawl calls throw
     */
    public static void main(String[] args) throws Exception {
        final CrawlConfig settings = crawlSettings();

        // The fetcher is shared by the robots.txt server and the controller.
        final PageFetcher fetcher = new PageFetcher(settings);
        final RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher);
        final CrawlController crawl = new CrawlController(settings, fetcher, robots);

        // Seeds are the first URLs fetched.
        // NOTE(review): with a max depth of 0 the crawl presumably stops at
        // the seed page instead of following its links - confirm against the
        // crawler4j documentation.
        crawl.addSeed(SEED_URL);

        // Blocking call: control returns here only once crawling has finished.
        crawl.start(CrawlerDemo.class, CRAWLER_COUNT);
    }

    /**
     * Creates the configuration used by {@link #main(String[])}.
     *
     * @return a new {@code CrawlConfig} with storage folder, depth limit,
     *         page limit, resumable crawling and politeness delay set
     */
    private static CrawlConfig crawlSettings() {
        final CrawlConfig settings = new CrawlConfig();
        settings.setCrawlStorageFolder(STORAGE_DIR);
        settings.setMaxDepthOfCrawling(MAX_DEPTH);
        settings.setMaxPagesToFetch(MAX_PAGES);
        settings.setResumableCrawling(true);
        // Be polite: wait 1000 ms between requests.
        settings.setPolitenessDelay(POLITENESS_DELAY_MS);
        return settings;
    }
}