
Commit 93473ff

Merge pull request #5236 from FlorentinD/fix-gradients-apply-self-gradient
Correct gradients by applying the self gradient
2 parents b494f3a + 237a8d0 commit 93473ff

File tree

11 files changed: +113 −65 lines changed

ml/ml-core/src/main/java/org/neo4j/gds/ml/core/ComputationContext.java

Lines changed: 23 additions & 2 deletions

```diff
@@ -20,7 +20,7 @@
 package org.neo4j.gds.ml.core;
 
 import org.jetbrains.annotations.TestOnly;
-import org.neo4j.gds.ml.core.functions.PassthroughVariable;
+import org.neo4j.gds.ml.core.functions.SingleParentVariable;
 import org.neo4j.gds.ml.core.tensor.Tensor;
 
 import java.util.HashMap;
@@ -70,7 +70,7 @@ public void backward(Variable<?> function) {
 
         gradients.clear();
         Queue<BackPropTask> executionQueue = new LinkedBlockingQueue<>();
-        PassthroughVariable<?> dummy = new PassthroughVariable<>(function);
+        var dummy = new PassthroughVariable<>(function);
         executionQueue.add(new BackPropTask(function, dummy));
         Map<Variable<?>, AtomicInteger> upstreamCounters = new HashMap<>();
         initUpstream(dummy, upstreamCounters);
@@ -169,4 +169,25 @@ static class BackPropTask {
         }
     }
 
+    private static class PassthroughVariable<T extends Tensor<T>> extends SingleParentVariable<T, T> {
+
+        public PassthroughVariable(Variable<T> parent) {
+            super(parent, parent.dimensions());
+
+            if (parent instanceof PassthroughVariable) {
+                throw new IllegalArgumentException("Redundant use of PassthroughVariables. Chaining does not make sense.");
+            }
+        }
+
+        @Override
+        public T apply(ComputationContext ctx) {
+            return ctx.data(parent);
+        }
+
+        @Override
+        public T gradientForParent(ComputationContext ctx) {
+            // initialize gradient computation with `1`
+            return ctx.data(parent).map(v -> 1);
+        }
+    }
 }
```
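The `PassthroughVariable` that previously lived in the public `functions` package is now a private seed node inside `ComputationContext`: `backward` wraps the loss variable in it, and its `gradientForParent` returns a tensor of ones, i.e. the chain-rule seed d(loss)/d(loss) = 1. A minimal, self-contained sketch of the seeding idea (hypothetical names, not GDS code):

```java
// backward() starts from a dummy node whose gradient is all ones,
// so every node below it can uniformly scale its local derivative
// by the gradient arriving from above (its "self gradient").
public class BackpropSeedSketch {

    // local rule for y = x^2: dy/dx = 2x, scaled by the upstream gradient
    static double squareGrad(double x, double upstream) {
        return upstream * 2 * x;
    }

    // local rule for z = 3y: dz/dy = 3, scaled by the upstream gradient
    static double tripleGrad(double upstream) {
        return upstream * 3;
    }

    public static void main(String[] args) {
        double x = 2.0;
        double seed = 1.0;                   // the PassthroughVariable's contribution
        double gradY = tripleGrad(seed);     // 3.0
        double gradX = squareGrad(x, gradY); // chain rule: 3 * 2x = 12.0 = d(3x^2)/dx
        System.out.println(gradX);
    }
}
```

With the seed fixed at `1`, every other variable can apply the chain rule by multiplying its local derivative with `ctx.gradient(this)`, which is exactly what the remaining changes in this commit do.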

ml/ml-core/src/main/java/org/neo4j/gds/ml/core/functions/CrossEntropyLoss.java

Lines changed: 3 additions & 7 deletions

```diff
@@ -71,23 +71,19 @@ public Tensor<?> gradient(Variable<?> parent, ComputationContext ctx) {
             Matrix gradient = predictionsMatrix.createWithSameDimensions();
             var targetsVector = ctx.data(targets);
 
-            var multiplier = -1.0 / gradient.rows();
+            var multiplier = - ctx.gradient(this).value() / gradient.rows();
             for (int row = 0; row < gradient.rows(); row++) {
                 var trueClass = (int) targetsVector.dataAt(row);
                 var predictedProbabilityForTrueClass = predictionsMatrix.dataAt(row * predictionsMatrix.cols() + trueClass);
 
                 // Compare to a threshold value rather than `0`, very small probability can result in setting infinite gradient values.
                 if (predictedProbabilityForTrueClass > PREDICTED_PROBABILITY_THRESHOLD) {
-                    gradient.setDataAt(
-                        row * predictionsMatrix.cols() + trueClass,
-                        multiplier / predictedProbabilityForTrueClass
-                    );
+                    gradient.setDataAt(row, trueClass, multiplier / predictedProbabilityForTrueClass);
                 }
             }
             return gradient;
         } else {
-            // targets should never require a gradient
-            return ctx.data(parent).createWithSameDimensions();
+            throw new IllegalStateException("The gradient should not be necessary for the targets. But got: " + targets.render());
         }
     }
 }
```
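This is the core of the fix: the old multiplier `-1.0 / gradient.rows()` silently assumed the loss was the root of the computation graph, i.e. an upstream gradient of exactly 1. Scaling by `ctx.gradient(this).value()` applies the chain rule, so functions chained on top of the loss now backpropagate correctly; the targets branch also fails fast instead of returning a zero tensor. A standalone numeric sketch of the corrected entry (hypothetical values, not GDS code):

```java
// For each row's true class, cross entropy contributes -log(p) / N,
// so d(loss)/dp = -1 / (N * p); the fix scales this by the upstream
// ("self") gradient g, which is only 1 when the loss is the graph root.
public class CrossEntropyGradSketch {
    public static void main(String[] args) {
        int n = 3;        // number of examples (rows)
        double p = 0.5;   // predicted probability of the true class
        double g = 0.25;  // upstream gradient reaching the loss

        double oldEntry = (-1.0 / n) / p; // pre-fix: correct only when g == 1
        double newEntry = (-g / n) / p;   // chain rule applied

        System.out.println(oldEntry);     // -0.666...
        System.out.println(newEntry);     // -0.1666...
    }
}
```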

ml/ml-core/src/main/java/org/neo4j/gds/ml/core/functions/EWiseAddMatrixScalar.java

Lines changed: 3 additions & 2 deletions

```diff
@@ -52,10 +52,11 @@ public Matrix apply(ComputationContext ctx) {
 
     @Override
     public Tensor<?> gradient(Variable<?> parent, ComputationContext ctx) {
+        Matrix selfGradient = ctx.gradient(this);
         if (parent == matrixVariable) {
-            return ctx.gradient(this);
+            return selfGradient;
         } else {
-            return new Scalar(ctx.gradient(this).aggregateSum());
+            return new Scalar(selfGradient.aggregateSum());
         }
     }
 }
```
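Behavior is unchanged here; `ctx.gradient(this)` is merely hoisted into a local, the same refactor applied to `ElementSum` and `L2NormSquared` below. The else branch is still worth a note: the scalar operand was broadcast into every matrix cell during the forward pass, so each cell contributes a partial derivative of 1 and the chain rule sums all upstream entries, hence `aggregateSum()`. A standalone sketch (hypothetical values, not GDS code):

```java
// Gradient of a broadcast add with respect to the scalar operand:
// the scalar s appears once in every cell, so its gradient is the
// sum of the upstream gradient's entries.
public class BroadcastAddGradSketch {
    public static void main(String[] args) {
        double[] upstream = {0.1, -0.2, 0.4, 0.3}; // gradient flowing into the add

        double scalarGrad = 0;
        for (double u : upstream) {
            scalarGrad += u;                       // plays the role of aggregateSum()
        }
        System.out.println(scalarGrad);            // 0.6
    }
}
```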

ml/ml-core/src/main/java/org/neo4j/gds/ml/core/functions/ElementSum.java

Lines changed: 2 additions & 1 deletion

```diff
@@ -45,6 +45,7 @@ public Scalar apply(ComputationContext ctx) {
 
     @Override
     public Tensor<?> gradient(Variable<?> parent, ComputationContext ctx) {
-        return ctx.data(parent).map(ignore -> ctx.gradient(this).value());
+        double selfGradient = ctx.gradient(this).value();
+        return ctx.data(parent).map(ignore -> selfGradient);
     }
 }
```

ml/ml-core/src/main/java/org/neo4j/gds/ml/core/functions/L2NormSquared.java

Lines changed: 2 additions & 1 deletion

```diff
@@ -54,6 +54,7 @@ public Scalar apply(ComputationContext ctx) {
 
     @Override
    public Matrix gradientForParent(ComputationContext ctx) {
-        return ctx.data(parent).copy().scalarMultiply(2 * ctx.gradient(this).value());
+        double selfGradient = ctx.gradient(this).value();
+        return ctx.data(parent).scalarMultiply(2 * selfGradient);
    }
 }
```
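Two details here: this function already honored the self gradient, since d‖X‖²/dX = 2X becomes 2·g·X for an upstream gradient g, so the hoisting is a readability refactor; and the `.copy()` is dropped, presumably because `scalarMultiply`, unlike the `scalarMultiplyMutate` used in `LogisticLoss`, returns a fresh tensor rather than mutating its receiver.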

ml/ml-core/src/main/java/org/neo4j/gds/ml/core/functions/LogisticLoss.java

Lines changed: 7 additions & 4 deletions

```diff
@@ -128,6 +128,8 @@ else if (predicted == 1.0) {
 
     @Override
     public Tensor<?> gradient(Variable<?> parent, ComputationContext ctx) {
+        var selfGradient = ctx.gradient(this).value();
+
         if (parent == weights) {
             ctx.forward(predictions);
             var predVector = ctx.data(predictions);
@@ -141,7 +143,7 @@ public Tensor<?> gradient(Variable<?> parent, ComputationContext ctx) {
             for (int idx = 0; idx < numberOfExamples; idx++) {
                 double errorPerExample = (predVector.dataAt(idx) - targetVector.dataAt(idx)) / numberOfExamples;
                 for (int feature = 0; feature < featureCount; feature++) {
-                    gradient.addDataAt(feature, errorPerExample * featuresTensor.dataAt(idx, feature));
+                    gradient.addDataAt(feature, selfGradient * errorPerExample * featuresTensor.dataAt(idx, feature));
                 }
             }
             return gradient;
@@ -154,13 +156,14 @@ public Tensor<?> gradient(Variable<?> parent, ComputationContext ctx) {
 
             for (int idx = 0; idx < numberOfExamples; idx++) {
                 double errorPerExample = (predVector.dataAt(idx) - targetVector.dataAt(idx));
-                gradient.addDataAt(0, errorPerExample);
+                gradient.addDataAt(0, selfGradient * errorPerExample);
             }
 
             return gradient.scalarMultiplyMutate(1.0D / numberOfExamples);
         } else {
-            // assume feature and target variables do not require gradient
-            return ctx.data(parent).createWithSameDimensions();
+            throw new IllegalStateException(
+                "The gradient should only be computed for the bias and the weights parents, but got " + parent.render()
+            );
         }
     }
 
```
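The same chain-rule application for `LogisticLoss`: both the weights branch and the bias branch scale the per-example error `prediction - target` by the self gradient, and requesting the gradient of any other parent now throws instead of silently returning zeros. A standalone sketch of the bias case (hypothetical numbers, not GDS code):

```java
// The bias gradient of logistic loss is the mean prediction error;
// the fix scales each error term by the upstream ("self") gradient
// before averaging, mirroring the loop in the diff above.
public class LogisticBiasGradSketch {
    public static void main(String[] args) {
        double[] predicted = {0.8, 0.3};
        double[] target = {1.0, 0.0};
        double selfGradient = 0.5; // 1.0 only when the loss is the graph root

        double gradBias = 0;
        for (int i = 0; i < predicted.length; i++) {
            gradBias += selfGradient * (predicted[i] - target[i]);
        }
        gradBias /= predicted.length; // the scalarMultiplyMutate(1.0 / numberOfExamples) step
        System.out.println(gradBias); // 0.5 * (-0.2 + 0.3) / 2 = 0.025
    }
}
```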

ml/ml-core/src/main/java/org/neo4j/gds/ml/core/functions/PassthroughVariable.java

Lines changed: 0 additions & 45 deletions
This file was deleted.

ml/ml-core/src/main/java/org/neo4j/gds/ml/core/functions/ReducedCrossEntropyLoss.java

Lines changed: 7 additions & 2 deletions

```diff
@@ -69,6 +69,7 @@ public static long sizeInBytes() {
 
     @Override
     public Scalar apply(ComputationContext ctx) {
+        // manually call forward as `predictions` is not registered as a parent
         var predictionsMatrix = ctx.forward(predictions);
         var labelsVector = ctx.data(labels);
 
@@ -85,9 +86,13 @@ public Scalar apply(ComputationContext ctx) {
 
     @Override
     public Tensor<?> gradient(Variable<?> parent, ComputationContext ctx) {
+        // manually call forward as `predictions` is not registered as a parent
         var predMatrix = ctx.forward(predictions);
         var labelsVector = ctx.data(labels);
         int numberOfExamples = labelsVector.length();
+
+        var selfGradient = ctx.gradient(this).value();
+
         if (parent == weights) {
             var weightsMatrix = ctx.data(weights);
             var featureMatrix = ctx.data(features);
@@ -102,7 +107,7 @@ public Tensor<?> gradient(Variable<?> parent, ComputationContext ctx) {
                     var indicatorIsTrueClass = trueClass == classIdx ? 1.0 : 0.0;
                     var errorPerExample = (predictedClassProbability - indicatorIsTrueClass) / numberOfExamples;
                     for (int feature = 0; feature < featureCount; feature++) {
-                        gradient.addDataAt(classIdx, feature, errorPerExample * featureMatrix.dataAt(row, feature));
+                        gradient.addDataAt(classIdx, feature, selfGradient * errorPerExample * featureMatrix.dataAt(row, feature));
                     }
                 }
             }
@@ -118,7 +123,7 @@ public Tensor<?> gradient(Variable<?> parent, ComputationContext ctx) {
                 double predictedClassProbability = predMatrix.dataAt(row, classIdx);
                 var indicatorIsTrueClass = trueClass == classIdx ? 1.0 : 0.0;
                 var errorPerExample = (predictedClassProbability - indicatorIsTrueClass) / numberOfExamples;
-                gradient.addDataAt(classIdx, errorPerExample);
+                gradient.addDataAt(classIdx, selfGradient * errorPerExample);
             }
         }
         return gradient;
```
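Besides scaling the error term by `selfGradient` in both the weights and bias branches, the new comments record a subtlety of this variable: `predictions` is not registered as a parent, so the regular forward pass over the graph never computes it, and both `apply` and `gradient` must invoke `ctx.forward(predictions)` themselves before reading its data.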

ml/ml-core/src/test/java/org/neo4j/gds/ml/core/functions/CrossEntropyLossTest.java

Lines changed: 21 additions & 1 deletion

```diff
@@ -76,8 +76,28 @@ void shouldComputeGradientCorrectly() {
     }
 
     @Test
-    void infiniteSmallProbabilities() {
+    void considerSelfGradient() {
+        var targets = Constant.vector(new double[]{1.0, 2.0, 0.0});
+        var predictions = new Weights<>(
+            new Matrix(
+                new double[]{
+                    0.35, 0.65, 0.0,
+                    0.45, 0.45, 0.1,
+                    0.14, 0.66, 0.2
+                },
+                3, 3
+            )
+        );
 
+        var loss = new CrossEntropyLoss(predictions, targets);
+        var chainedLoss = new Sigmoid<>(loss);
+
+        finiteDifferenceShouldApproximateGradient(predictions, chainedLoss);
+    }
+
+
+    @Test
+    void infiniteSmallProbabilities() {
         var predictions = new Weights<>(new Matrix(new double[]{5.277E-321, 5.277E-321}, 1, 2));
         var targets = Constant.vector(new double[]{1});
 
```
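Why this test catches the bug: wrapping the loss in a `Sigmoid` makes the loss an interior node of the graph, so the upstream gradient reaching `CrossEntropyLoss.gradient` is σ(L)·(1−σ(L)) rather than 1, and an implementation that ignores `ctx.gradient(this)` no longer matches the finite-difference estimate. A standalone sketch of that upstream factor (hypothetical loss value, not GDS code):

```java
// The derivative of sigmoid is sigma * (1 - sigma), which is at most
// 0.25; chaining Sigmoid over the loss therefore guarantees that the
// loss sees an upstream gradient different from 1.
public class SigmoidUpstreamSketch {
    public static void main(String[] args) {
        double loss = 1.2;                          // some loss value L
        double sig = 1.0 / (1.0 + Math.exp(-loss)); // sigma(L)
        double upstream = sig * (1.0 - sig);        // d sigma(L) / dL
        System.out.println(upstream);               // ~0.178, clearly != 1
    }
}
```

The `considerSelfGradient` test added to `LogisticLossTest` below uses the same trick.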

ml/ml-core/src/test/java/org/neo4j/gds/ml/core/functions/LogisticLossTest.java

Lines changed: 16 additions & 0 deletions

```diff
@@ -20,6 +20,7 @@
 package org.neo4j.gds.ml.core.functions;
 
 import org.assertj.core.data.Offset;
+import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.ValueSource;
 import org.neo4j.gds.ml.core.ComputationContext;
@@ -71,6 +72,21 @@ void logisticLossApproximatesGradient(boolean withBias) {
         finiteDifferenceShouldApproximateGradient(weights, loss);
     }
 
+    @Test
+    void considerSelfGradient() {
+        var features = Constant.matrix(new double[]{0.23, 0.52, 0.62, 0.32, 0.64, 0.71}, 2, 3);
+        var targets = Constant.vector(new double[]{1.0, 0.0});
+        var weights = new Weights<>(new Matrix(new double[]{0.35, 0.41, 1.0}, 1, 3));
+        var bias = Weights.ofScalar(2.5);
+
+        var predictions = new Sigmoid<>(new MatrixMultiplyWithTransposedSecondOperand(features, weights));
+
+        var loss = new LogisticLoss(weights, bias, predictions, features, targets);
+        var chainedLoss = new Sigmoid<>(loss);
+
+        finiteDifferenceShouldApproximateGradient(weights, chainedLoss);
+    }
+
     @Override
     public double epsilon() {
         return 1e-7;
```
