Skip to content

Commit 033cb42

Browse files
committed
Add Javadoc
Based on #61
1 parent 6b794b6 commit 033cb42

21 files changed

+548
-19
lines changed

README.md

Whitespace-only changes.

src/main/java/de/rub/nds/crawler/CommonMain.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,24 @@
1818
import org.apache.logging.log4j.LogManager;
1919
import org.apache.logging.log4j.Logger;
2020

21+
/**
22+
* Common functionality for crawler main entry points. Provides functionality to parse command line
23+
* arguments and start the worker/controller as specified by the user.
24+
*/
2125
public class CommonMain {
2226
private static final Logger LOGGER = LogManager.getLogger();
2327

28+
/**
29+
* Main entry point for the application. Uses JCommander to parse the controller/worker command
30+
* and parse the arguments into the respective configuration object. Then starts the
31+
* controller/worker.
32+
*
33+
* @param args Command line arguments
34+
* @param controllerCommandConfig Configuration for the controller. Will be filled by JCommander
35+
* from the command line if first argument is "controller".
36+
* @param workerCommandConfig Configuration for the worker. Will be filled by JCommander from
37+
* the command line if first argument is "worker".
38+
*/
2439
public static void main(
2540
String[] args,
2641
ControllerCommandConfig controllerCommandConfig,
@@ -71,6 +86,15 @@ public static void main(
7186
}
7287
}
7388

89+
/**
90+
* Convenience method to start the application with just a controller configuration. Creates a
91+
* default worker configuration. See {@link #main(String[], ControllerCommandConfig,
92+
* WorkerCommandConfig)} for details.
93+
*
94+
* @param args Command line arguments
95+
* @param controllerConfig Configuration for the controller. Will be filled by JCommander from
96+
* the command line if first argument is "controller".
97+
*/
7498
public static void main(String[] args, ControllerCommandConfig controllerConfig) {
7599
main(args, controllerConfig, new WorkerCommandConfig());
76100
}

src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@
2626
import org.apache.commons.validator.routines.UrlValidator;
2727
import org.quartz.CronScheduleBuilder;
2828

29+
/**
30+
* Configuration class for controller instances used to parse command line parameters. Contains
31+
* settings for the controller's behavior, including scan parameters, target selection, and
32+
* notification settings. This abstract class provides the base configuration, while specific
33+
* scanner implementations must extend it to provide scanner-specific configuration.
34+
*/
2935
public abstract class ControllerCommandConfig {
3036

3137
@ParametersDelegate private final RabbitMqDelegate rabbitMqDelegate;
@@ -123,6 +129,7 @@ public void validate() {
123129
}
124130
}
125131

132+
/** Validator that ensures parameter values are positive integers. */
126133
public static class PositiveInteger implements IParameterValidator {
127134
public void validate(String name, String value) throws ParameterException {
128135
int n = Integer.parseInt(value);
@@ -133,6 +140,7 @@ public void validate(String name, String value) throws ParameterException {
133140
}
134141
}
135142

143+
/** Validator that ensures parameter values are valid cron expressions. */
136144
public static class CronSyntax implements IParameterValidator {
137145
public void validate(String name, String value) throws ParameterException {
138146
CronScheduleBuilder.cronSchedule(value);

src/main/java/de/rub/nds/crawler/config/WorkerCommandConfig.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@
1313
import de.rub.nds.crawler.config.delegate.MongoDbDelegate;
1414
import de.rub.nds.crawler.config.delegate.RabbitMqDelegate;
1515

16+
/**
17+
* Configuration class for worker instances used to parse command line parameters. Contains settings
18+
* for the worker's behavior, including thread counts and timeouts, as well as MongoDB and RabbitMQ
19+
* connection settings.
20+
*/
1621
public class WorkerCommandConfig {
1722

1823
@ParametersDelegate private final RabbitMqDelegate rabbitMqDelegate;

src/main/java/de/rub/nds/crawler/config/delegate/MongoDbDelegate.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,17 @@
1010

1111
import com.beust.jcommander.Parameter;
1212

13+
/** Configuration delegate that holds MongoDB connection settings. */
1314
public class MongoDbDelegate {
1415

1516
@Parameter(
1617
names = "-mongoDbHost",
17-
description = "Host of the MongoDB instance this crawler saves to.")
18+
description = "Host of the MongoDB instance this crawler saves results to.")
1819
private String mongoDbHost;
1920

2021
@Parameter(
2122
names = "-mongoDbPort",
22-
description = "Port of the MongoDB instance this crawler saves to.")
23+
description = "Port of the MongoDB instance this crawler saves results to.")
2324
private int mongoDbPort;
2425

2526
@Parameter(
@@ -29,12 +30,12 @@ public class MongoDbDelegate {
2930

3031
@Parameter(
3132
names = "-mongoDbPass",
32-
description = "The passwort to be used to authenticate with MongoDB.")
33+
description = "The password to be used to authenticate with MongoDB.")
3334
private String mongoDbPass;
3435

3536
@Parameter(
3637
names = "-mongoDbPassFile",
37-
description = "The passwort to be used to authenticate with MongoDB.")
38+
description = "The password to be used to authenticate with MongoDB.")
3839
private String mongoDbPassFile;
3940

4041
@Parameter(

src/main/java/de/rub/nds/crawler/config/delegate/RabbitMqDelegate.java

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,24 +10,31 @@
1010

1111
import com.beust.jcommander.Parameter;
1212

13+
/** Configuration delegate that holds RabbitMQ connection settings. */
1314
public class RabbitMqDelegate {
1415

15-
@Parameter(names = "-rabbitMqHost")
16+
@Parameter(names = "-rabbitMqHost", description = "Host of the RabbitMQ instance")
1617
private String rabbitMqHost;
1718

18-
@Parameter(names = "-rabbitMqPort")
19+
@Parameter(names = "-rabbitMqPort", description = "Port of the RabbitMQ instance")
1920
private int rabbitMqPort;
2021

21-
@Parameter(names = "-rabbitMqUser")
22+
@Parameter(names = "-rabbitMqUser", description = "Username for RabbitMQ authentication")
2223
private String rabbitMqUser;
2324

24-
@Parameter(names = "-rabbitMqPass")
25+
@Parameter(
26+
names = "-rabbitMqPass",
27+
description =
28+
"Password for RabbitMQ authentication. Alternatively use -rabbitMqPassFile")
2529
private String rabbitMqPass;
2630

27-
@Parameter(names = "-rabbitMqPassFile")
31+
@Parameter(
32+
names = "-rabbitMqPassFile",
33+
description =
34+
"File containing the password for RabbitMQ authentication. Alternatively use -rabbitMqPass")
2835
private String rabbitMqPassFile;
2936

30-
@Parameter(names = "-rabbitMqTLS")
37+
@Parameter(names = "-rabbitMqTLS", description = "Use TLS for the RabbitMQ connection")
3138
private boolean rabbitMqTLS;
3239

3340
public String getRabbitMqHost() {

src/main/java/de/rub/nds/crawler/constant/CruxListNumber.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
*/
99
package de.rub.nds.crawler.constant;
1010

11+
/**
12+
* Enumeration of different Crux list sizes available for scanning. Each enum constant represents a
13+
* specific list of top websites, with the value indicating the number of entries in that list.
14+
*/
1115
public enum CruxListNumber {
1216
TOP_1k(1000),
1317
TOP_5K(5000),

src/main/java/de/rub/nds/crawler/constant/JobStatus.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
*/
99
package de.rub.nds.crawler.constant;
1010

11+
/**
12+
* Enumeration of possible job status values. Indicates the current state or final result of a scan
13+
* job.
14+
*/
1115
public enum JobStatus {
1216
/** Job is waiting to be executed. */
1317
TO_BE_EXECUTED(false),
@@ -42,6 +46,11 @@ public enum JobStatus {
4246
this.isError = isError;
4347
}
4448

49+
/**
50+
* Get whether this status represents an error condition.
51+
*
52+
* @return True if this status is an error, false otherwise
53+
*/
4554
public boolean isError() {
4655
return isError;
4756
}

src/main/java/de/rub/nds/crawler/core/BulkScanWorker.java

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,33 @@
1212
import de.rub.nds.crawler.data.ScanTarget;
1313
import de.rub.nds.crawler.util.CanceallableThreadPoolExecutor;
1414
import de.rub.nds.scanner.core.execution.NamedThreadFactory;
15-
import java.util.concurrent.*;
15+
import java.util.concurrent.Future;
16+
import java.util.concurrent.LinkedBlockingDeque;
17+
import java.util.concurrent.ThreadPoolExecutor;
18+
import java.util.concurrent.TimeUnit;
1619
import java.util.concurrent.atomic.AtomicBoolean;
1720
import java.util.concurrent.atomic.AtomicInteger;
1821
import org.apache.logging.log4j.LogManager;
1922
import org.apache.logging.log4j.Logger;
2023
import org.bson.Document;
2124

25+
/**
26+
* A worker that scans all targets for a single scan ID. Instances are managed using the {@link
27+
* BulkScanWorkerManager}.
28+
*
29+
* @param <T> The specific ScanConfig type used by this worker
30+
*/
2231
public abstract class BulkScanWorker<T extends ScanConfig> {
2332
private static final Logger LOGGER = LogManager.getLogger();
2433
private final AtomicInteger activeJobs = new AtomicInteger(0);
2534
private final AtomicBoolean initialized = new AtomicBoolean(false);
2635
private final AtomicBoolean shouldCleanupSelf = new AtomicBoolean(false);
2736
private final Object initializationLock = new Object();
37+
38+
/** The bulk scan ID for this worker. This is unique across all workers. */
2839
protected final String bulkScanId;
40+
41+
/** The scan configuration for this worker. */
2942
protected final T scanConfig;
3043

3144
/**
@@ -34,6 +47,15 @@ public abstract class BulkScanWorker<T extends ScanConfig> {
3447
*/
3548
private final ThreadPoolExecutor timeoutExecutor;
3649

50+
/**
51+
* Creates a new bulk scan worker. This should only be called by the {@link
52+
* BulkScanWorkerManager}.
53+
*
54+
* @param bulkScanId The ID of the bulk scan this worker is associated with
55+
* @param scanConfig The scan configuration for this worker
56+
* @param parallelScanThreads The number of parallel scan threads to use, i.e., how many {@link
57+
* ScanTarget}s to handle in parallel.
58+
*/
3759
protected BulkScanWorker(String bulkScanId, T scanConfig, int parallelScanThreads) {
3860
this.bulkScanId = bulkScanId;
3961
this.scanConfig = scanConfig;
@@ -48,6 +70,13 @@ protected BulkScanWorker(String bulkScanId, T scanConfig, int parallelScanThread
4870
new NamedThreadFactory("crawler-worker: scan executor"));
4971
}
5072

73+
/**
74+
* Handles a scan target by submitting it to the executor. If init was not called, it will
75+
* initialize itself. In this case it will also clean itself up once all jobs are done.
76+
*
77+
* @param scanTarget The target to scan.
78+
* @return A future that resolves to the scan result once the scan is done.
79+
*/
5180
public Future<Document> handle(ScanTarget scanTarget) {
5281
// if we initialized ourself, we also clean up ourself
5382
shouldCleanupSelf.weakCompareAndSetAcquire(false, init());
@@ -62,8 +91,21 @@ public Future<Document> handle(ScanTarget scanTarget) {
6291
});
6392
}
6493

94+
/**
95+
* Scans a target and returns the result as a Document. This is the core scanning functionality
96+
* that must be implemented by subclasses.
97+
*
98+
* @param scanTarget The target to scan
99+
* @return The scan result as a Document
100+
*/
65101
public abstract Document scan(ScanTarget scanTarget);
66102

103+
/**
104+
* Initializes this worker if it hasn't been initialized yet. This method is thread-safe and
105+
* will only initialize once.
106+
*
107+
* @return True if this call performed the initialization, false if already initialized
108+
*/
67109
public final boolean init() {
68110
// synchronize such that no thread runs before being initialized
69111
// but only synchronize if not already initialized
@@ -78,6 +120,13 @@ public final boolean init() {
78120
return false;
79121
}
80122

123+
/**
124+
* Cleans up this worker if it has been initialized and has no active jobs. This method is
125+
* thread-safe and will only clean up once. If there are still active jobs, it will enqueue the
126+
* cleanup for later.
127+
*
128+
* @return True if this call performed the cleanup, false otherwise
129+
*/
81130
public final boolean cleanup() {
82131
// synchronize such that init and cleanup do not run simultaneously
83132
// but only synchronize if already initialized
@@ -99,7 +148,17 @@ public final boolean cleanup() {
99148
return false;
100149
}
101150

151+
/**
152+
* Performs the actual initialization of this worker. This method is called exactly once by
153+
* {@link #init()} when initialization is needed. Subclasses must implement this method to
154+
* initialize their specific resources.
155+
*/
102156
protected abstract void initInternal();
103157

158+
/**
159+
* Performs the actual cleanup of this worker. This method is called exactly once by {@link
160+
* #cleanup()} when cleanup is needed. Subclasses must implement this method to clean up their
161+
* specific resources.
162+
*/
104163
protected abstract void cleanupInternal();
105164
}

src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,29 @@
2222
import org.apache.logging.log4j.Logger;
2323
import org.bson.Document;
2424

25+
/**
26+
* Each bulk scan has its own BulkScanWorker. This class manages the BulkScanWorkers and ensures that
27+
* each BulkScanWorker is only created once and is cleaned up after it is not used anymore.
28+
*
29+
* <p>More concretely: If a scan with ID 1 is started, a worker is created. This worker will exist
30+
* as long as scan targets for scan ID 1 are processed. If the worker does not receive jobs for more
31+
* than 30 minutes, it is considered stale. Upon receiving a new job, stale workers are removed.
32+
* I.e., if a new ID 1 job arrives after 30 minutes, the worker persists. If a new ID 2 job arrives, a
33+
* worker for that is created, and stale workers are removed.
34+
*
35+
* <p>This class manages the mechanism above and acts as a singleton factory and manager for
36+
* BulkScanWorker instances.
37+
*/
2538
public class BulkScanWorkerManager {
2639
private static final Logger LOGGER = LogManager.getLogger();
2740
private static volatile BulkScanWorkerManager instance;
2841

42+
/**
43+
* Gets the singleton instance of the BulkScanWorkerManager. Creates the instance if it doesn't
44+
* exist yet.
45+
*
46+
* @return The singleton instance
47+
*/
2948
public static BulkScanWorkerManager getInstance() {
3049
if (instance == null) {
3150
synchronized (BulkScanWorkerManager.class) {
@@ -37,6 +56,17 @@ public static BulkScanWorkerManager getInstance() {
3756
return instance;
3857
}
3958

59+
/**
60+
* Static convenience method to handle a scan job. See also {@link #handle(ScanJobDescription,
61+
* int, int)}.
62+
*
63+
* @param scanJobDescription The scan job to handle
64+
* @param parallelConnectionThreads The number of parallel connection threads to use (used to
65+
* create worker if it does not exist)
66+
* @param parallelScanThreads The number of parallel scan threads to use (used to create worker
67+
* if it does not exist)
68+
* @return A future that returns the scan result once the target has been scanned
69+
*/
4070
public static Future<Document> handleStatic(
4171
ScanJobDescription scanJobDescription,
4272
int parallelConnectionThreads,
@@ -62,6 +92,19 @@ private BulkScanWorkerManager() {
6292
.build();
6393
}
6494

95+
/**
96+
* Gets or creates a bulk scan worker for the specified bulk scan. Workers are cached and reused
97+
* to enforce thread limits.
98+
*
99+
* @param bulkScanId The ID of the bulk scan to get the worker for (used as key in the cache)
100+
* @param scanConfig The scan configuration (used to create worker if it does not exist)
101+
* @param parallelConnectionThreads The number of parallel connection threads to use (used to
102+
* create worker if it does not exist)
103+
* @param parallelScanThreads The number of parallel scan threads to use (used to create worker
104+
* if it does not exist)
105+
* @return A bulk scan worker for the specified bulk scan
106+
* @throws UncheckedException If a worker cannot be created
107+
*/
65108
public BulkScanWorker<?> getBulkScanWorker(
66109
String bulkScanId,
67110
ScanConfig scanConfig,
@@ -83,6 +126,17 @@ public BulkScanWorker<?> getBulkScanWorker(
83126
}
84127
}
85128

129+
/**
130+
* Handles a scan job by creating or retrieving the appropriate worker and submitting the scan
131+
* target for processing.
132+
*
133+
* @param scanJobDescription The scan job to handle
134+
* @param parallelConnectionThreads The number of parallel connection threads to use (used to
135+
* create worker if it does not exist)
136+
* @param parallelScanThreads The number of parallel scan threads to use (used to create worker
137+
* if it does not exist)
138+
* @return A future that returns the scan result once the target has been scanned
139+
*/
86140
public Future<Document> handle(
87141
ScanJobDescription scanJobDescription,
88142
int parallelConnectionThreads,

0 commit comments

Comments
 (0)