@@ -147,15 +147,18 @@ public ModelTrainResult train(Graph graph, HugeObjectArray<double[]> features) {
 
         double previousLoss = Double.MAX_VALUE;
         boolean converged = false;
-        var epochLosses = new ArrayList<Double>();
+        var iterationLossesPerEpoch = new ArrayList<List<Double>>();
 
         progressTracker.beginSubTask("Train model");
 
         for (int epoch = 1; epoch <= epochs; epoch++) {
             progressTracker.beginSubTask("Epoch");
 
-            double newLoss = trainEpoch(batchTasks, weights);
-            epochLosses.add(newLoss);
+
+            var iterationLosses = trainEpoch(batchTasks, weights);
+            iterationLossesPerEpoch.add(iterationLosses);
+            var newLoss = iterationLosses.get(iterationLosses.size() - 1);
+
             progressTracker.endSubTask("Epoch");
             if (Math.abs((newLoss - previousLoss) / previousLoss) < tolerance) {
                 converged = true;
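With this hunk, an epoch's reported loss becomes the loss of its last iteration, and convergence is still judged on the relative change between consecutive epoch losses. A minimal, self-contained sketch of that criterion (class name and sample values are illustrative, not part of the patch):

```java
final class ConvergenceCheck {
    // Relative-change criterion, as in the training loop above: converged once
    // |(newLoss - previousLoss) / previousLoss| drops below the tolerance.
    static boolean hasConverged(double newLoss, double previousLoss, double tolerance) {
        return Math.abs((newLoss - previousLoss) / previousLoss) < tolerance;
    }

    public static void main(String[] args) {
        // |(9.9 - 10.0) / 10.0| = 0.01, below a tolerance of 0.05.
        System.out.println(hasConverged(9.9, 10.0, 0.05)); // true
    }
}
```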
@@ -166,7 +169,7 @@ public ModelTrainResult train(Graph graph, HugeObjectArray<double[]> features) {
 
         progressTracker.endSubTask("Train model");
 
-        return ModelTrainResult.of(epochLosses, converged, layers);
+        return ModelTrainResult.of(iterationLossesPerEpoch, converged, layers);
     }
 
     private BatchTask createBatchTask(
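The value handed to ModelTrainResult.of is now nested: the outer list is indexed by epoch, the inner lists by iteration within that epoch. A toy illustration of the shape (all numbers invented):

```java
import java.util.List;

final class LossShape {
    public static void main(String[] args) {
        // Two epochs: the first ran three iterations, the second two.
        List<List<Double>> iterationLossesPerEpoch = List.of(
            List.of(1.40, 1.10, 0.95), // epoch 1
            List.of(0.90, 0.88)        // epoch 2
        );
        // The per-epoch loss is the last entry of each inner list: 0.95, 0.88.
        iterationLossesPerEpoch.forEach(losses ->
            System.out.println(losses.get(losses.size() - 1)));
    }
}
```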
@@ -200,17 +203,18 @@ private BatchTask createBatchTask(
         return new BatchTask(lossFunction, weights, tolerance, progressTracker);
     }
 
-    private double trainEpoch(List<BatchTask> batchTasks, List<Weights<? extends Tensor<?>>> weights) {
+    private List<Double> trainEpoch(List<BatchTask> batchTasks, List<Weights<? extends Tensor<?>>> weights) {
         var updater = new AdamOptimizer(weights, learningRate);
 
-        double totalLoss = Double.NaN;
         int iteration = 1;
+        var iterationLosses = new ArrayList<Double>();
         for (;iteration <= maxIterations; iteration++) {
             progressTracker.beginSubTask("Iteration");
 
             // run forward + maybe backward for each Batch
             ParallelUtil.runWithConcurrency(concurrency, batchTasks, executor);
-            totalLoss = batchTasks.stream().mapToDouble(BatchTask::loss).average().orElseThrow();
+            var avgLoss = batchTasks.stream().mapToDouble(BatchTask::loss).average().orElseThrow();
+            iterationLosses.add(avgLoss);
 
             var converged = batchTasks.stream().allMatch(task -> task.converged);
             if (converged) {
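Each iteration now records the loss averaged over all batch tasks rather than overwriting a single running value. A hedged sketch of that aggregation, with a stand-in record in place of the real BatchTask:

```java
import java.util.List;

final class LossAveraging {
    // Stand-in for BatchTask, which exposes its loss after a run.
    record FakeBatchTask(double loss) {}

    public static void main(String[] args) {
        var batchTasks = List.of(new FakeBatchTask(0.8), new FakeBatchTask(1.2));
        // Average the per-batch losses; an empty task list would be a bug,
        // hence orElseThrow() rather than a default value.
        var avgLoss = batchTasks.stream()
            .mapToDouble(FakeBatchTask::loss)
            .average()
            .orElseThrow();
        System.out.println(avgLoss); // 1.0
    }
}
```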
@@ -227,12 +231,11 @@ private double trainEpoch(List<BatchTask> batchTasks, List<Weights<? extends Tensor<?>>> weights) {
 
             updater.update(meanGradients);
 
-            progressTracker.logMessage(formatWithLocale("LOSS: %.10f", totalLoss));
-
+            progressTracker.logMessage(formatWithLocale("LOSS: %.10f", avgLoss));
             progressTracker.endSubTask("Iteration");
         }
 
-        return totalLoss;
+        return iterationLosses;
     }
 
     static class BatchTask implements Runnable {
@@ -359,14 +362,27 @@ static GraphSageTrainMetrics empty() {
             return ImmutableGraphSageTrainMetrics.of(List.of(), false);
         }
 
-        List<Double> epochLosses();
+        @Value.Derived
+        default List<Double> epochLosses() {
+            return iterationLossPerEpoch().stream()
+                .map(iterationLosses -> iterationLosses.get(iterationLosses.size() - 1))
+                .collect(Collectors.toList());
+        }
+
+        List<List<Double>> iterationLossPerEpoch();
+
         boolean didConverge();
 
         @Value.Derived
         default int ranEpochs() {
-            return epochLosses().isEmpty()
+            return iterationLossPerEpoch().isEmpty()
                 ? 0
-                : epochLosses().size();
+                : iterationLossPerEpoch().size();
+        }
+
+        @Value.Derived
+        default List<Integer> ranIterationsPerEpoch() {
+            return iterationLossPerEpoch().stream().map(List::size).collect(Collectors.toList());
         }
 
         @Override
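epochLosses() is kept for backwards compatibility but is now derived from the nested list: the last iteration loss of each epoch. The same derivations in plain Java, without the Immutables machinery (data invented):

```java
import java.util.List;
import java.util.stream.Collectors;

final class DerivedMetrics {
    public static void main(String[] args) {
        List<List<Double>> iterationLossPerEpoch = List.of(
            List.of(1.40, 1.10, 0.95),
            List.of(0.90, 0.88)
        );
        // epochLosses(): last loss of each epoch -> [0.95, 0.88]
        List<Double> epochLosses = iterationLossPerEpoch.stream()
            .map(losses -> losses.get(losses.size() - 1))
            .collect(Collectors.toList());
        // ranIterationsPerEpoch(): size of each inner list -> [3, 2]
        List<Integer> ranIterationsPerEpoch = iterationLossPerEpoch.stream()
            .map(List::size)
            .collect(Collectors.toList());
        System.out.println(epochLosses);
        System.out.println(ranIterationsPerEpoch);
    }
}
```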
@@ -376,8 +392,10 @@ default Map<String, Object> toMap() {
             return Map.of(
                 "metrics", Map.of(
                     "epochLosses", epochLosses(),
+                    "iterationLossesPerEpoch", iterationLossPerEpoch(),
                     "didConverge", didConverge(),
-                    "ranEpochs", ranEpochs()
+                    "ranEpochs", ranEpochs(),
+                    "ranIterationsPerEpoch", ranIterationsPerEpoch()
             ));
         }
     }
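Callers reading the metrics map now see both the legacy and the new keys. A small sketch of the structure toMap() produces, with invented numbers:

```java
import java.util.List;
import java.util.Map;

final class MetricsMapExample {
    public static void main(String[] args) {
        // Mirrors the structure built in toMap() above; values are made up.
        Map<String, Object> result = Map.of(
            "metrics", Map.of(
                "epochLosses", List.of(0.95, 0.88),
                "iterationLossesPerEpoch", List.of(List.of(1.40, 1.10, 0.95), List.of(0.90, 0.88)),
                "didConverge", true,
                "ranEpochs", 2,
                "ranIterationsPerEpoch", List.of(3, 2)
            )
        );
        System.out.println(result);
    }
}
```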
@@ -390,13 +408,13 @@ public interface ModelTrainResult {
         Layer[] layers();
 
         static ModelTrainResult of(
-            List<Double> epochLosses,
+            List<List<Double>> iterationLossesPerEpoch,
             boolean converged,
             Layer[] layers
         ) {
             return ImmutableModelTrainResult.builder()
                 .layers(layers)
-                .metrics(ImmutableGraphSageTrainMetrics.of(epochLosses, converged))
+                .metrics(ImmutableGraphSageTrainMetrics.of(iterationLossesPerEpoch, converged))
                 .build();
         }
     }
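ImmutableModelTrainResult and ImmutableGraphSageTrainMetrics are generated by the Immutables annotation processor from the interfaces above. For readers without that context, a simplified hand-rolled equivalent of what the factory assembles (records as stand-ins, Layer[] payload omitted):

```java
import java.util.List;

// Stand-ins for the generated value classes; the real code relies on the
// Immutables annotation processor instead of records.
record TrainMetrics(List<List<Double>> iterationLossPerEpoch, boolean didConverge) {}

record TrainResult(TrainMetrics metrics) {
    static TrainResult of(List<List<Double>> iterationLossesPerEpoch, boolean converged) {
        return new TrainResult(new TrainMetrics(iterationLossesPerEpoch, converged));
    }

    public static void main(String[] args) {
        var result = of(List.of(List.of(1.1, 0.9)), true);
        System.out.println(result.metrics().didConverge()); // true
    }
}
```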