Commit ed19d0a

Consider self gradient in ReducedCrossEntropyLoss
1 parent f9247ed commit ed19d0a
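
What the change does, in brief: ReducedCrossEntropyLoss.gradient previously behaved as if the loss were always the root of the computation graph, in effect assuming an upstream gradient of 1. When the loss variable is itself consumed by another variable (the new test wraps it in a Sigmoid), the chain rule requires scaling the local gradients by the gradient accumulated for the loss node itself, which is what ctx.gradient(this).value() retrieves. Schematically, for a scalar function g composed with the loss L(w):

    \frac{\partial}{\partial w}\, g(L(w)) = g'(L(w)) \cdot \frac{\partial L(w)}{\partial w}

Here g'(L(w)) is the "self gradient" factor applied in the diff below.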

2 files changed: 34 additions, 2 deletions

ml/ml-core/src/main/java/org/neo4j/gds/ml/core/functions/ReducedCrossEntropyLoss.java

Lines changed: 5 additions & 2 deletions
@@ -90,6 +90,9 @@ public Tensor<?> gradient(Variable<?> parent, ComputationContext ctx) {
         var predMatrix = ctx.forward(predictions);
         var labelsVector = ctx.data(labels);
         int numberOfExamples = labelsVector.length();
+
+        var selfGradient = ctx.gradient(this).value();
+
         if (parent == weights) {
             var weightsMatrix = ctx.data(weights);
             var featureMatrix = ctx.data(features);
@@ -104,7 +107,7 @@ public Tensor<?> gradient(Variable<?> parent, ComputationContext ctx) {
                     var indicatorIsTrueClass = trueClass == classIdx ? 1.0 : 0.0;
                     var errorPerExample = (predictedClassProbability - indicatorIsTrueClass) / numberOfExamples;
                     for (int feature = 0; feature < featureCount; feature++) {
-                        gradient.addDataAt(classIdx, feature, errorPerExample * featureMatrix.dataAt(row, feature));
+                        gradient.addDataAt(classIdx, feature, selfGradient * errorPerExample * featureMatrix.dataAt(row, feature));
                     }
                 }
             }
@@ -120,7 +123,7 @@ public Tensor<?> gradient(Variable<?> parent, ComputationContext ctx) {
                 double predictedClassProbability = predMatrix.dataAt(row, classIdx);
                 var indicatorIsTrueClass = trueClass == classIdx ? 1.0 : 0.0;
                 var errorPerExample = (predictedClassProbability - indicatorIsTrueClass) / numberOfExamples;
-                gradient.addDataAt(classIdx, errorPerExample);
+                gradient.addDataAt(classIdx, selfGradient * errorPerExample);
             }
         }
         return gradient;

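For intuition, here is a minimal self-contained sketch of the scaled weight-gradient computation above. It mirrors the loop structure of the patched method, but uses plain Java arrays and hypothetical names rather than the GDS Tensor/ComputationContext API:

    // Illustrative sketch only, not the GDS implementation.
    // selfGradient is the upstream gradient of the loss node
    // (1.0 when the loss is the root of the backward pass).
    final class ScaledGradientSketch {
        static double[][] weightGradient(
            double[][] predictions, // softmax outputs, [examples][classes]
            int[] labels,           // true class index per example
            double[][] features,    // [examples][features]
            double selfGradient
        ) {
            int numberOfExamples = predictions.length;
            int classCount = predictions[0].length;
            int featureCount = features[0].length;
            double[][] gradient = new double[classCount][featureCount];
            for (int row = 0; row < numberOfExamples; row++) {
                for (int classIdx = 0; classIdx < classCount; classIdx++) {
                    double indicator = labels[row] == classIdx ? 1.0 : 0.0;
                    double errorPerExample =
                        (predictions[row][classIdx] - indicator) / numberOfExamples;
                    for (int feature = 0; feature < featureCount; feature++) {
                        // Chain rule: scale the local gradient by the upstream one.
                        gradient[classIdx][feature] +=
                            selfGradient * errorPerExample * features[row][feature];
                    }
                }
            }
            return gradient;
        }
    }

With selfGradient == 1.0 this reduces to the pre-patch behavior; any other upstream value scales every entry uniformly, which is exactly the chain rule for a scalar-valued loss.
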
ml/ml-core/src/test/java/org/neo4j/gds/ml/core/functions/ReducedCrossEntropyLossTest.java

Lines changed: 29 additions & 0 deletions
@@ -172,6 +172,35 @@ void shouldComputeGradientCorrectlyStandard() {
         finiteDifferenceShouldApproximateGradient(List.of(bias, weights), loss);
     }
 
+    @Test
+    void considerSelfGradient() {
+        var features = Constant.matrix(
+            new double[]{0.23, 0.52, 0.62, 0.32, 0.64, 0.71, 0.29, -0.52, 0.12, -0.92, 0.6, -0.11},
+            3,
+            4
+        );
+        var labels = Constant.vector(new double[]{1.0, 0.0, 2.0});
+
+        var weights = new Weights<>(new Matrix(new double[]{0.35, 0.41, 1.0, 0.1, 0.54, 0.12, 0.81, 0.7}, 2, 4));
+        var bias = Weights.ofVector(0.37, 0.37);
+
+        var weightedFeatures = new MatrixMultiplyWithTransposedSecondOperand(features, weights);
+        var affineVariable = new MatrixVectorSum(weightedFeatures, bias);
+
+        var predictions = new ReducedSoftmax(affineVariable);
+
+        var loss = new ReducedCrossEntropyLoss(
+            predictions,
+            weights,
+            bias,
+            features,
+            labels
+        );
+        var chainedLoss = new Sigmoid<>(loss);
+
+        finiteDifferenceShouldApproximateGradient(List.of(bias, weights), chainedLoss);
+    }
+
     @Override
     public double epsilon() {
         return 1e-7;

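The test is what makes the fix observable: wrapping the loss in a Sigmoid gives the loss node a non-trivial upstream gradient, σ'(loss) = σ(loss) · (1 − σ(loss)), so the finite-difference check on chainedLoss exercises the new selfGradient factor; under the old code that factor was silently dropped. As a reminder of the idea behind the pre-existing helper, a generic central-difference approximation looks like this (an illustrative sketch, not the actual finiteDifferenceShouldApproximateGradient implementation):

    import java.util.function.DoubleUnaryOperator;

    final class CentralDifferenceSketch {
        // Numerically approximates f'(x); a gradient check compares such
        // estimates, one perturbed parameter at a time, against the
        // analytic gradients produced by the backward pass.
        static double centralDifference(DoubleUnaryOperator f, double x, double eps) {
            return (f.applyAsDouble(x + eps) - f.applyAsDouble(x - eps)) / (2 * eps);
        }
    }

The epsilon() override shown in the diff parameterizes that comparison.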