Commit f59afea

Consider only #concurrency batches per iteration
Drastically lowers the runtime of the algorithm and even gives better quality in most cases. Reasoning: before, we averaged the gradient over all batches, which was just noisy. Using an approximate gradient over a small sample of batches keeps us close enough to the true gradient while saving a lot of time.
1 parent d8ee182 commit f59afea
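
To make the reasoning concrete, here is a minimal, self-contained sketch of the sampling scheme (an illustration only, not the GDS implementation: Batch, the quadratic loss, and plain SGD in place of Adam are all invented for this example). Each iteration draws only concurrency batches at random, with replacement, and averages just their gradients, a cheap stochastic estimate of the full-batch gradient:

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

// Sketch of "consider only #concurrency batches per iteration": sample a
// handful of batches, average only their gradients, and take one step.
public final class ApproximateGradientSketch {

    record Batch(double target) {}

    public static void main(String[] args) {
        int concurrency = 4;
        var random = new Random(42L);

        List<Batch> allBatches = new ArrayList<>();
        for (int i = 0; i < 64; i++) {
            allBatches.add(new Batch(i % 8));
        }

        double weight = 0.0;
        for (int iteration = 0; iteration < 200; iteration++) {
            // Instead of averaging over all 64 batches, draw `concurrency` of them.
            double gradientSum = 0.0;
            for (int i = 0; i < concurrency; i++) {
                Batch batch = allBatches.get(random.nextInt(allBatches.size()));
                gradientSum += 2 * (weight - batch.target()); // d/dw of (w - target)^2
            }
            weight -= 0.05 * (gradientSum / concurrency); // SGD step on the sampled mean
        }
        // Ends near the mean target (3.5) without ever touching all batches at once.
        System.out.printf("learned weight: %.3f%n", weight);
    }
}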

File tree

4 files changed: +56 additions, -45 deletions

algo/src/main/java/org/neo4j/gds/embeddings/graphsage/GraphSageModelTrainer.java

Lines changed: 23 additions & 14 deletions
@@ -57,7 +57,9 @@
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.function.Function;
+import java.util.function.Supplier;
 import java.util.stream.Collectors;
+import java.util.stream.IntStream;
 import java.util.stream.LongStream;
 
 import static org.neo4j.gds.embeddings.graphsage.GraphSageHelper.embeddingsComputationGraph;
@@ -148,12 +150,17 @@ public ModelTrainResult train(Graph graph, HugeObjectArray<double[]> features) {
         boolean converged = false;
         var iterationLossesPerEpoch = new ArrayList<List<Double>>();
 
+        var prevEpochLoss = Double.NaN;
+        var random = new Random(randomSeed);
+
         progressTracker.beginSubTask("Train model");
 
         for (int epoch = 1; epoch <= epochs && !converged; epoch++) {
             progressTracker.beginSubTask("Epoch");
-            var epochResult = trainEpoch(batchTasks, weights);
-            iterationLossesPerEpoch.add(epochResult.losses());
+            var epochResult = trainEpoch(() -> batchTasks.get(random.nextInt(batchTasks.size())), weights, prevEpochLoss);
+            List<Double> epochLosses = epochResult.losses();
+            iterationLossesPerEpoch.add(epochLosses);
+            prevEpochLoss = epochLosses.get(epochLosses.size() - 1);
             converged = epochResult.converged();
             progressTracker.endSubTask("Epoch");
         }
@@ -194,27 +201,37 @@ private BatchTask createBatchTask(
         return new BatchTask(lossFunction, weights, tolerance, progressTracker);
     }
 
-    private EpochResult trainEpoch(List<BatchTask> batchTasks, List<Weights<? extends Tensor<?>>> weights) {
+    private EpochResult trainEpoch(Supplier<BatchTask> batchTaskSupplier, List<Weights<? extends Tensor<?>>> weights, double prevEpochLoss) {
         var updater = new AdamOptimizer(weights, learningRate);
 
         int iteration = 1;
         var iterationLosses = new ArrayList<Double>();
+        double prevLoss = prevEpochLoss;
         var converged = false;
 
         for (;iteration <= maxIterations; iteration++) {
             progressTracker.beginSubTask("Iteration");
 
+            // TODO let the user configure the number of batches per iteration
+            var batchTasks = IntStream
+                .range(0, concurrency)
+                .mapToObj(__ -> batchTaskSupplier.get())
+                .collect(Collectors.toList());
+
             // run forward + maybe backward for each Batch
             ParallelUtil.runWithConcurrency(concurrency, batchTasks, executor);
             var avgLoss = batchTasks.stream().mapToDouble(BatchTask::loss).average().orElseThrow();
             iterationLosses.add(avgLoss);
+            progressTracker.logMessage(formatWithLocale("LOSS: %.10f", avgLoss));
 
-            converged = batchTasks.stream().allMatch(task -> task.converged);
-            if (converged) {
-                progressTracker.endSubTask();
+            if (Math.abs(prevLoss - avgLoss) < tolerance) {
+                converged = true;
+                progressTracker.endSubTask("Iteration");
                 break;
             }
 
+            prevLoss = avgLoss;
+
             var batchedGradients = batchTasks
                 .stream()
                 .map(BatchTask::weightGradients)
@@ -223,8 +240,6 @@ private EpochResult trainEpoch(List<Weights<? extend
             var meanGradients = averageTensors(batchedGradients);
 
             updater.update(meanGradients);
-
-            progressTracker.logMessage(formatWithLocale("LOSS: %.10f", avgLoss));
             progressTracker.endSubTask("Iteration");
         }
 
@@ -245,7 +260,6 @@ static class BatchTask implements Runnable {
         private List<? extends Tensor<?>> weightGradients;
         private final double tolerance;
         private final ProgressTracker progressTracker;
-        private boolean converged;
         private double prevLoss;
 
         BatchTask(
@@ -262,14 +276,9 @@ static class BatchTask implements Runnable {
 
         @Override
         public void run() {
-            if(converged) { // Don't try to go further
-                return;
-            }
-
             var localCtx = new ComputationContext();
             var loss = localCtx.forward(lossFunction).value();
 
-            converged = Math.abs(prevLoss - loss) < tolerance;
             prevLoss = loss;
 
             localCtx.backward(lossFunction);
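
The diff above also changes where convergence is decided: instead of each BatchTask tracking its own converged flag, the trainer now compares the averaged iteration loss against the previous iteration's loss, seeding the first iteration of each epoch with prevEpochLoss from the epoch before. A condensed, runnable sketch of just that logic (an illustration, not the trainer itself; computeAverageLoss is a toy stand-in for the forward pass over the sampled batches):

import java.util.Random;

public final class ConvergenceSketch {

    public static void main(String[] args) {
        double tolerance = 1e-3;
        int maxIterations = 100;
        var random = new Random(42L);

        // prevEpochLoss starts as NaN; NaN comparisons are always false, so the
        // very first iteration can never report convergence spuriously.
        double prevLoss = Double.NaN;
        boolean converged = false;
        int iteration = 1;
        for (; iteration <= maxIterations; iteration++) {
            double avgLoss = computeAverageLoss(iteration, random);
            if (Math.abs(prevLoss - avgLoss) < tolerance) {
                converged = true;
                break;
            }
            prevLoss = avgLoss;
        }
        System.out.println("converged at iteration " + iteration + ": " + converged);
    }

    // Decaying loss with a little noise, mimicking randomly sampled batches.
    private static double computeAverageLoss(int iteration, Random random) {
        return 1.0 / iteration + random.nextDouble() * 1e-5;
    }
}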

algo/src/test/java/org/neo4j/gds/embeddings/graphsage/GraphSageModelTrainerTest.java

Lines changed: 21 additions & 21 deletions
@@ -228,17 +228,17 @@ void testLosses() {
         assertThat(epochLosses).isInstanceOf(List.class);
         assertThat(((List<Double>) epochLosses).stream().mapToDouble(Double::doubleValue).toArray())
             .contains(new double[]{
-                91.33327272,
-                88.17940500,
-                87.68340477,
-                85.60797746,
-                85.59108701,
-                85.59007234,
-                81.44403525,
-                81.44260858,
-                81.44349342,
-                81.45612978
-            }, Offset.offset(1e-8)
+                78.30,
+                71.55,
+                71.07,
+                71.65,
+                74.36,
+                74.08,
+                73.98,
+                80.28,
+                71.07,
+                71.07
+            }, Offset.offset(0.05)
             );
     }
 
@@ -276,16 +276,16 @@ void testLossesWithPoolAggregator() {
         assertThat(epochLosses).isInstanceOf(List.class);
         assertThat(((List<Double>) epochLosses).stream().mapToDouble(Double::doubleValue).toArray())
             .contains(new double[]{
-                90.53,
-                83.29,
-                74.75,
-                74.61,
-                74.68,
-                74.54,
-                74.46,
-                74.47,
-                74.41,
-                74.41
+                87.34,
+                80.75,
+                74.07,
+                93.12,
+                96.36,
+                80.50,
+                77.31,
+                99.70,
+                83.60,
+                83.60
             }, Offset.offset(0.05)
             );
     }

algo/src/test/java/org/neo4j/gds/embeddings/graphsage/algo/GraphSageTrainAlgorithmFactoryTest.java

Lines changed: 4 additions & 2 deletions
@@ -460,15 +460,17 @@ void memoryEstimationTreeStructure(boolean isMultiLabel) {
 
     @Test
     void testLogging() {
-        var config = ImmutableGraphSageTrainConfig.builder()
-            .addFeatureProperties(DUMMY_PROPERTY)
+        var config = GraphSageTrainConfigImpl.builder()
+            .username("DUMMY")
+            .featureProperties(List.of(DUMMY_PROPERTY))
             .embeddingDimension(12)
             .aggregator(Aggregator.AggregatorType.POOL)
             .tolerance(1e-10)
             .sampleSizes(List.of(5, 3))
             .batchSize(5)
             .randomSeed(42L)
             .modelName("model")
+            .activationFunction("RELU")
             .epochs(2)
             .maxIterations(2)
             .build();

doc/asciidoc/machine-learning/node-embeddings/graph-sage/graph-sage.adoc

Lines changed: 8 additions & 8 deletions
@@ -278,7 +278,7 @@ RETURN
 .Results
 |===
 | modelName | didConverge | ranEpochs | epochLosses
-| "exampleTrainModel" | true | 1 | [186.04946807210226]
+| "exampleTrainModel" | true | 1 | [186.0494680638198]
 |===
 --
 
@@ -504,13 +504,13 @@ YIELD nodeId, embedding
 .Results
 |===
 | nodeId | embedding
-| 0 | [0.528500243954147, 0.46821819122905217, 0.7081378518617193]
-| 1 | [0.5285002439545966, 0.4682181912292858, 0.7081378518612291]
-| 2 | [0.5285002439541305, 0.4682181912290437, 0.7081378518617372]
-| 3 | [0.528500243952747, 0.46821819122832464, 0.7081378518632452]
-| 4 | [0.5285002439970667, 0.46821819125135444, 0.7081378518149409]
-| 5 | [0.5285002440594959, 0.46821819128379416, 0.7081378517468996]
-| 6 | [0.528500243952941, 0.46821819122842556, 0.7081378518630335]
+| 0 | [0.5285002294775042, 0.46821819621782496, 0.7081378593674258]
+| 1 | [0.5285002294779538, 0.4682181962180586, 0.7081378593669356]
+| 2 | [0.5285002294774878, 0.46821819621781646, 0.7081378593674437]
+| 3 | [0.5285002294761042, 0.4682181962170975, 0.7081378593689517]
+| 4 | [0.5285002295204241, 0.4682181962401272, 0.7081378593206474]
+| 5 | [0.528500229582853, 0.468218196272567, 0.7081378592526062]
+| 6 | [0.5285002294762983, 0.4682181962171984, 0.7081378593687399]
 |===
 --