Skip to content

Commit dbdc9d7

Browse files
authored
Merge pull request #5294 from FlorentinD/gs-loss-per-iteration
GraphSage return loss per iteration
2 parents 4644250 + 434a177 commit dbdc9d7

File tree

5 files changed

+64
-33
lines changed

5 files changed

+64
-33
lines changed

algo/src/main/java/org/neo4j/gds/embeddings/graphsage/GraphSageModelTrainer.java

Lines changed: 43 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -145,28 +145,22 @@ public ModelTrainResult train(Graph graph, HugeObjectArray<double[]> features) {
145145

146146
progressTracker.endSubTask("Prepare batches");
147147

148-
double previousLoss = Double.MAX_VALUE;
149148
boolean converged = false;
150-
var epochLosses = new ArrayList<Double>();
149+
var iterationLossesPerEpoch = new ArrayList<List<Double>>();
151150

152151
progressTracker.beginSubTask("Train model");
153152

154-
for (int epoch = 1; epoch <= epochs; epoch++) {
153+
for (int epoch = 1; epoch <= epochs && !converged; epoch++) {
155154
progressTracker.beginSubTask("Epoch");
156-
157-
double newLoss = trainEpoch(batchTasks, weights);
158-
epochLosses.add(newLoss);
155+
var epochResult = trainEpoch(batchTasks, weights);
156+
iterationLossesPerEpoch.add(epochResult.losses());
157+
converged = epochResult.converged();
159158
progressTracker.endSubTask("Epoch");
160-
if (Math.abs((newLoss - previousLoss) / previousLoss) < tolerance) {
161-
converged = true;
162-
break;
163-
}
164-
previousLoss = newLoss;
165159
}
166160

167161
progressTracker.endSubTask("Train model");
168162

169-
return ModelTrainResult.of(epochLosses, converged, layers);
163+
return ModelTrainResult.of(iterationLossesPerEpoch, converged, layers);
170164
}
171165

172166
private BatchTask createBatchTask(
@@ -200,19 +194,22 @@ private BatchTask createBatchTask(
200194
return new BatchTask(lossFunction, weights, tolerance, progressTracker);
201195
}
202196

203-
private double trainEpoch(List<BatchTask> batchTasks, List<Weights<? extends Tensor<?>>> weights) {
197+
private EpochResult trainEpoch(List<BatchTask> batchTasks, List<Weights<? extends Tensor<?>>> weights) {
204198
var updater = new AdamOptimizer(weights, learningRate);
205199

206-
double totalLoss = Double.NaN;
207200
int iteration = 1;
201+
var iterationLosses = new ArrayList<Double>();
202+
var converged = false;
203+
208204
for (;iteration <= maxIterations; iteration++) {
209205
progressTracker.beginSubTask("Iteration");
210206

211207
// run forward + maybe backward for each Batch
212208
ParallelUtil.runWithConcurrency(concurrency, batchTasks, executor);
213-
totalLoss = batchTasks.stream().mapToDouble(BatchTask::loss).average().orElseThrow();
209+
var avgLoss = batchTasks.stream().mapToDouble(BatchTask::loss).average().orElseThrow();
210+
iterationLosses.add(avgLoss);
214211

215-
var converged = batchTasks.stream().allMatch(task -> task.converged);
212+
converged = batchTasks.stream().allMatch(task -> task.converged);
216213
if (converged) {
217214
progressTracker.endSubTask();
218215
break;
@@ -227,12 +224,18 @@ private double trainEpoch(List<BatchTask> batchTasks, List<Weights<? extends Ten
227224

228225
updater.update(meanGradients);
229226

230-
progressTracker.logMessage(formatWithLocale("LOSS: %.10f", totalLoss));
231-
227+
progressTracker.logMessage(formatWithLocale("LOSS: %.10f", avgLoss));
232228
progressTracker.endSubTask("Iteration");
233229
}
234230

235-
return totalLoss;
231+
return ImmutableEpochResult.of(converged, iterationLosses);
232+
}
233+
234+
@ValueClass
235+
interface EpochResult {
236+
boolean converged();
237+
238+
List<Double> losses();
236239
}
237240

238241
static class BatchTask implements Runnable {
@@ -359,14 +362,27 @@ static GraphSageTrainMetrics empty() {
359362
return ImmutableGraphSageTrainMetrics.of(List.of(), false);
360363
}
361364

362-
List<Double> epochLosses();
365+
@Value.Derived
366+
default List<Double> epochLosses() {
367+
return iterationLossPerEpoch().stream()
368+
.map(iterationLosses -> iterationLosses.get(iterationLosses.size() - 1))
369+
.collect(Collectors.toList());
370+
}
371+
372+
List<List<Double>> iterationLossPerEpoch();
373+
363374
boolean didConverge();
364375

365376
@Value.Derived
366377
default int ranEpochs() {
367-
return epochLosses().isEmpty()
378+
return iterationLossPerEpoch().isEmpty()
368379
? 0
369-
: epochLosses().size();
380+
: iterationLossPerEpoch().size();
381+
}
382+
383+
@Value.Derived
384+
default List<Integer> ranIterationsPerEpoch() {
385+
return iterationLossPerEpoch().stream().map(List::size).collect(Collectors.toList());
370386
}
371387

372388
@Override
@@ -376,8 +392,10 @@ default Map<String, Object> toMap() {
376392
return Map.of(
377393
"metrics", Map.of(
378394
"epochLosses", epochLosses(),
395+
"iterationLossesPerEpoch", iterationLossPerEpoch(),
379396
"didConverge", didConverge(),
380-
"ranEpochs", ranEpochs()
397+
"ranEpochs", ranEpochs(),
398+
"ranIterationsPerEpoch", ranIterationsPerEpoch()
381399
));
382400
}
383401
}
@@ -390,13 +408,13 @@ public interface ModelTrainResult {
390408
Layer[] layers();
391409

392410
static ModelTrainResult of(
393-
List<Double> epochLosses,
411+
List<List<Double>> iterationLossesPerEpoch,
394412
boolean converged,
395413
Layer[] layers
396414
) {
397415
return ImmutableModelTrainResult.builder()
398416
.layers(layers)
399-
.metrics(ImmutableGraphSageTrainMetrics.of(epochLosses, converged))
417+
.metrics(ImmutableGraphSageTrainMetrics.of(iterationLossesPerEpoch, converged))
400418
.build();
401419
}
402420
}

algo/src/test/java/org/neo4j/gds/embeddings/graphsage/GraphSageModelTrainerTest.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ void testLosses() {
219219
var metrics = trainResult.metrics();
220220
assertThat(metrics.didConverge()).isFalse();
221221
assertThat(metrics.ranEpochs()).isEqualTo(10);
222+
assertThat(metrics.ranIterationsPerEpoch()).containsExactly(100, 100, 100, 100, 100, 100, 100, 100, 100, 100);
222223

223224
var metricsMap = metrics.toMap().get("metrics");
224225
assertThat(metricsMap).isInstanceOf(Map.class);
@@ -266,6 +267,7 @@ void testLossesWithPoolAggregator() {
266267
var metrics = trainResult.metrics();
267268
assertThat(metrics.didConverge()).isFalse();
268269
assertThat(metrics.ranEpochs()).isEqualTo(10);
270+
assertThat(metrics.ranIterationsPerEpoch()).containsExactly(100, 100, 100, 100, 100, 100, 100, 100, 100, 100);
269271

270272
var metricsMap = metrics.toMap().get("metrics");
271273
assertThat(metricsMap).isInstanceOf(Map.class);
@@ -301,6 +303,7 @@ void testConvergence() {
301303
var trainMetrics = trainResult.metrics();
302304
assertThat(trainMetrics.didConverge()).isTrue();
303305
assertThat(trainMetrics.ranEpochs()).isEqualTo(1);
306+
assertThat(trainMetrics.ranIterationsPerEpoch()).containsExactly(2);
304307
}
305308

306309
@ParameterizedTest

algo/src/test/java/org/neo4j/gds/embeddings/graphsage/algo/GraphSageTrainAlgorithmFactoryTest.java

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
import static org.eclipse.collections.impl.tuple.primitive.PrimitiveTuples.pair;
5959
import static org.junit.jupiter.api.Assertions.assertEquals;
6060
import static org.junit.jupiter.params.provider.Arguments.arguments;
61+
import static org.neo4j.gds.assertj.Extractors.keepingFixedNumberOfDecimals;
6162
import static org.neo4j.gds.assertj.Extractors.removingThreadId;
6263
import static org.neo4j.gds.compat.TestLog.INFO;
6364
import static org.neo4j.gds.core.utils.mem.MemoryEstimations.RESIDENT_MEMORY;
@@ -461,13 +462,15 @@ void memoryEstimationTreeStructure(boolean isMultiLabel) {
461462
void testLogging() {
462463
var config = ImmutableGraphSageTrainConfig.builder()
463464
.addFeatureProperties(DUMMY_PROPERTY)
464-
.embeddingDimension(64)
465+
.embeddingDimension(12)
466+
.aggregator(Aggregator.AggregatorType.POOL)
467+
.tolerance(1e-10)
468+
.sampleSizes(List.of(5, 3))
469+
.batchSize(5)
470+
.randomSeed(42L)
465471
.modelName("model")
466472
.epochs(2)
467473
.maxIterations(2)
468-
.tolerance(1e-10)
469-
.learningRate(0.001)
470-
.randomSeed(42L)
471474
.build();
472475

473476
var log = Neo4jProxy.testLog();
@@ -485,6 +488,7 @@ void testLogging() {
485488
AssertionsForInterfaceTypes.assertThat(messagesInOrder)
486489
// avoid asserting on the thread id
487490
.extracting(removingThreadId())
491+
.extracting(keepingFixedNumberOfDecimals(2))
488492
.containsExactly(
489493
"GraphSageTrain :: Start",
490494
"GraphSageTrain :: Prepare batches :: Start",
@@ -493,17 +497,23 @@ void testLogging() {
493497
"GraphSageTrain :: Train model :: Start",
494498
"GraphSageTrain :: Train model :: Epoch 1 of 2 :: Start",
495499
"GraphSageTrain :: Train model :: Epoch 1 of 2 :: Iteration 1 of 2 :: Start",
496-
"GraphSageTrain :: Train model :: Epoch 1 of 2 :: Iteration 1 of 2 :: LOSS: 531.5699087433",
500+
"GraphSageTrain :: Train model :: Epoch 1 of 2 :: Iteration 1 of 2 :: LOSS: 132.63",
497501
"GraphSageTrain :: Train model :: Epoch 1 of 2 :: Iteration 1 of 2 100%",
498502
"GraphSageTrain :: Train model :: Epoch 1 of 2 :: Iteration 1 of 2 :: Finished",
499503
"GraphSageTrain :: Train model :: Epoch 1 of 2 :: Iteration 2 of 2 :: Start",
504+
"GraphSageTrain :: Train model :: Epoch 1 of 2 :: Iteration 2 of 2 :: LOSS: 129.13",
500505
"GraphSageTrain :: Train model :: Epoch 1 of 2 :: Iteration 2 of 2 100%",
501506
"GraphSageTrain :: Train model :: Epoch 1 of 2 :: Iteration 2 of 2 :: Finished",
502507
"GraphSageTrain :: Train model :: Epoch 1 of 2 :: Finished",
503508
"GraphSageTrain :: Train model :: Epoch 2 of 2 :: Start",
504509
"GraphSageTrain :: Train model :: Epoch 2 of 2 :: Iteration 1 of 2 :: Start",
510+
"GraphSageTrain :: Train model :: Epoch 2 of 2 :: Iteration 1 of 2 :: LOSS: 123.38",
505511
"GraphSageTrain :: Train model :: Epoch 2 of 2 :: Iteration 1 of 2 100%",
506512
"GraphSageTrain :: Train model :: Epoch 2 of 2 :: Iteration 1 of 2 :: Finished",
513+
"GraphSageTrain :: Train model :: Epoch 2 of 2 :: Iteration 2 of 2 :: Start",
514+
"GraphSageTrain :: Train model :: Epoch 2 of 2 :: Iteration 2 of 2 :: LOSS: 116.06",
515+
"GraphSageTrain :: Train model :: Epoch 2 of 2 :: Iteration 2 of 2 100%",
516+
"GraphSageTrain :: Train model :: Epoch 2 of 2 :: Iteration 2 of 2 :: Finished",
507517
"GraphSageTrain :: Train model :: Epoch 2 of 2 :: Finished",
508518
"GraphSageTrain :: Train model :: Finished",
509519
"GraphSageTrain :: Finished"

doc/asciidoc/machine-learning/node-embeddings/graph-sage/graph-sage.adoc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ RETURN
278278
.Results
279279
|===
280280
| modelName | didConverge | ranEpochs | epochLosses
281-
| "exampleTrainModel" | false | 1 | [186.04946807210226]
281+
| "exampleTrainModel" | true | 1 | [186.04946807210226]
282282
|===
283283
--
284284

doc/asciidoc/machine-learning/node-embeddings/graph-sage/specific-train-configuration.adoc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010
| sampleSizes | List of Integer | [25, 10] | yes | A list of Integer values, the size of the list determines the number of layers and the values determine how many nodes will be sampled by the layers.
1111
| projectedFeatureDimension | Integer | n/a | yes | The dimension of the projected `featureProperties`. This enables multi-label GraphSage, where each label can have a subset of the `featureProperties`.
1212
| batchSize | Integer | 100 | yes | The number of nodes per batch.
13-
| <<common-configuration-tolerance,tolerance>> | Float | 1e-4 | yes | Tolerance used for the early convergence of an epoch.
13+
| <<common-configuration-tolerance,tolerance>> | Float | 1e-4 | yes | Tolerance used for the early convergence of an epoch, which is checked after each iteration.
1414
| learningRate | Float | 0.1 | yes | The learning rate determines the step size at each iteration while moving toward a minimum of a loss function.
1515
| epochs | Integer | 1 | yes | Number of times to traverse the graph.
16-
| <<common-configuration-max-iterations,maxIterations>> | Integer | 10 | yes | Maximum number of weight updates per batch. Batches can also converge early based on `tolerance`.
16+
| <<common-configuration-max-iterations,maxIterations>> | Integer | 10 | yes | Maximum number of iterations per epoch. In each iteration, the weights are updated.
1717
| searchDepth | Integer | 5 | yes | Maximum depth of the RandomWalks to sample nearby nodes for the training.
1818
| negativeSampleWeight | Integer | 20 | yes | The weight of the negative samples. Higher values increase the impact of negative samples in the loss.
1919
| <<common-configuration-relationship-weight-property,relationshipWeightProperty>> | String | null | yes | Name of the relationship property to use as weights. If unspecified, the algorithm runs unweighted.

0 commit comments

Comments
 (0)