
Commit a8cc1dc

sfluegel committed

Merge remote-tracking branch 'origin/dev' into code_documentation

# Conflicts:
#	chebai/models/electra.py

2 parents: 7f22f83 + 290348d

24 files changed: +784 -206 lines
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+from lightning.pytorch.callbacks import BasePredictionWriter
+import torch
+import os
+import pickle
+
+
+class PredictionWriter(BasePredictionWriter):
+    def __init__(self, output_dir, write_interval):
+        super().__init__(write_interval)
+        self.output_dir = output_dir
+        self.prediction_file_name = "predictions.pkl"
+
+    def write_on_epoch_end(self, trainer, pl_module, predictions, batch_indices):
+        results = [
+            dict(
+                ident=row["data"]["idents"][0],
+                predictions=torch.sigmoid(row["output"]["logits"]).numpy(),
+                labels=row["labels"][0].numpy() if row["labels"] is not None else None,
+            )
+            for row in predictions
+        ]
+        with open(
+            os.path.join(self.output_dir, self.prediction_file_name), "wb"
+        ) as fout:
+            pickle.dump(results, fout)
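
A minimal usage sketch for the new callback (not part of the diff; it assumes the PredictionWriter class defined above is importable, and the output directory and predict call are placeholders):

from lightning.pytorch import Trainer

writer = PredictionWriter(output_dir="results", write_interval="epoch")
trainer = Trainer(callbacks=[writer])
# trainer.predict(model, datamodule=dm) would then pickle a list of
# {"ident", "predictions", "labels"} dicts to results/predictions.pkl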

chebai/loss/bce_weighted.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+import torch
+from chebai.preprocessing.datasets.chebi import _ChEBIDataExtractor
+import pandas as pd
+import os
+import pickle
+
+
+class BCEWeighted(torch.nn.BCEWithLogitsLoss):
+    """BCEWithLogitsLoss with weights automatically computed according to beta parameter (formula from
+    https://openaccess.thecvf.com/content_CVPR_2019/papers/Cui_Class-Balanced_Loss_Based_on_Effective_Number_of_Samples_CVPR_2019_paper.pdf)
+    """
+
+    def __init__(self, beta: float = None, data_extractor: _ChEBIDataExtractor = None):
+        self.beta = beta
+        self.data_extractor = data_extractor
+        super().__init__()
+
+    def set_pos_weight(self, input):
+        if (
+            self.beta is not None
+            and self.data_extractor is not None
+            and all(
+                os.path.exists(os.path.join(self.data_extractor.raw_dir, raw_file))
+                for raw_file in self.data_extractor.raw_file_names
+            )
+            and self.pos_weight is None
+        ):
+            complete_data = pd.concat(
+                [
+                    pickle.load(
+                        open(
+                            os.path.join(
+                                self.data_extractor.raw_dir,
+                                self.data_extractor.raw_file_names_dict[set],
+                            ),
+                            "rb",
+                        )
+                    )
+                    for set in ["train", "validation", "test"]
+                ]
+            )
+            value_counts = []
+            for c in complete_data.columns[3:]:
+                value_counts.append(len([v for v in complete_data[c] if v]))
+            weights = [
+                (1 - self.beta) / (1 - pow(self.beta, value)) for value in value_counts
+            ]
+            mean = sum(weights) / len(weights)
+            self.pos_weight = torch.tensor(
+                [w / mean for w in weights], device=input.device
+            )
+
+    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+        self.set_pos_weight(input)
+        return super().forward(input, target)
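
For reference, the per-class weight computed above follows the class-balanced loss of Cui et al. (2019): w_c = (1 - beta) / (1 - beta^n_c), where n_c is the number of positive examples of class c, and the weights are then normalised by their mean. A standalone sketch with made-up counts (not part of the diff):

beta = 0.99
value_counts = [500, 50, 5]  # toy per-class positive counts
weights = [(1 - beta) / (1 - beta**n) for n in value_counts]
mean = sum(weights) / len(weights)
pos_weight = [w / mean for w in weights]
# rare classes end up with pos_weight > 1, frequent classes with pos_weight < 1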

chebai/loss/semantic.py

Lines changed: 88 additions & 37 deletions
@@ -2,24 +2,40 @@
 import os
 import pickle
 
+import math
 import torch
+from typing import Literal
 
-from chebai.models.electra import extract_class_hierarchy
-
-IMPLICATION_CACHE_FILE = "chebi.cache"
+from chebai.preprocessing.datasets.chebi import _ChEBIDataExtractor, ChEBIOver100
 
 
 class ImplicationLoss(torch.nn.Module):
     def __init__(
-        self, path_to_chebi, path_to_label_names, base_loss: torch.nn.Module = None
+        self,
+        data_extractor: _ChEBIDataExtractor,
+        base_loss: torch.nn.Module = None,
+        tnorm: Literal["product", "lukasiewicz", "xu19"] = "product",
+        impl_loss_weight=0.1,  # weight of implication loss in relation to base_loss
+        pos_scalar=1,
+        pos_epsilon=0.01,
     ):
         super().__init__()
+        self.data_extractor = data_extractor
         self.base_loss = base_loss
-        label_names = _load_label_names(path_to_label_names)
-        hierarchy = _load_implications(path_to_chebi)
-        implication_filter = _build_implication_filter(label_names, hierarchy)
+        self.implication_cache_file = f"implications_{self.data_extractor.name}.cache"
+        self.label_names = _load_label_names(
+            os.path.join(data_extractor.raw_dir, "classes.txt")
+        )
+        self.hierarchy = self._load_implications(
+            os.path.join(data_extractor.raw_dir, "chebi.obo")
+        )
+        implication_filter = _build_implication_filter(self.label_names, self.hierarchy)
         self.implication_filter_l = implication_filter[:, 0]
         self.implication_filter_r = implication_filter[:, 1]
+        self.tnorm = tnorm
+        self.impl_weight = impl_loss_weight
+        self.pos_scalar = pos_scalar
+        self.eps = pos_epsilon
 
     def forward(self, input, target, **kwargs):
         nnl = kwargs.pop("non_null_labels", None)
@@ -36,40 +52,77 @@ def forward(self, input, target, **kwargs):
         r = pred[:, self.implication_filter_r]
         # implication_loss = torch.sqrt(torch.mean(torch.sum(l*(1-r), dim=-1), dim=0))
         implication_loss = self._calculate_implication_loss(l, r)
-        return base_loss + implication_loss
+
+        return (
+            base_loss + self.impl_weight * implication_loss,
+            base_loss,
+            implication_loss,
+        )
 
     def _calculate_implication_loss(self, l, r):
-        capped_difference = torch.relu(l - r)
+        assert not l.isnan().any()
+        assert not r.isnan().any()
+        if self.pos_scalar != 1:
+            l = (
+                torch.pow(l + self.eps, 1 / self.pos_scalar)
+                - math.pow(self.eps, 1 / self.pos_scalar)
+            ) / (
+                math.pow(1 + self.eps, 1 / self.pos_scalar)
+                - math.pow(self.eps, 1 / self.pos_scalar)
+            )
+            r = torch.pow(r, self.pos_scalar)
+        if self.tnorm == "product":
+            individual_loss = l * (1 - r)
+        elif self.tnorm == "xu19":
+            individual_loss = -torch.log(1 - l * (1 - r))
+        elif self.tnorm == "lukasiewicz":
+            individual_loss = torch.relu(l - r)
+        else:
+            raise NotImplementedError(f"Unknown tnorm {self.tnorm}")
+
         return torch.mean(
-            torch.sum(
-                (torch.softmax(capped_difference, dim=-1) * capped_difference), dim=-1
-            ),
+            torch.sum(individual_loss, dim=-1),
             dim=0,
         )
 
+    def _load_implications(self, path_to_chebi):
+        if os.path.isfile(self.implication_cache_file):
+            with open(self.implication_cache_file, "rb") as fin:
+                hierarchy = pickle.load(fin)
+        else:
+            hierarchy = self.data_extractor.extract_class_hierarchy(path_to_chebi)
+            with open(self.implication_cache_file, "wb") as fout:
+                pickle.dump(hierarchy, fout)
+        return hierarchy
+
 
 class DisjointLoss(ImplicationLoss):
     def __init__(
         self,
-        path_to_chebi,
-        path_to_label_names,
-        path_to_disjointedness,
+        path_to_disjointness,
+        data_extractor: _ChEBIDataExtractor,
         base_loss: torch.nn.Module = None,
+        disjoint_loss_weight=100,
+        **kwargs,
     ):
-        super().__init__(path_to_chebi, path_to_label_names, base_loss)
-        label_names = _load_label_names(path_to_label_names)
-        hierarchy = _load_implications(path_to_chebi)
+        super().__init__(data_extractor, base_loss, **kwargs)
        self.disjoint_filter_l, self.disjoint_filter_r = _build_disjointness_filter(
-            path_to_disjointedness, label_names, hierarchy
+            path_to_disjointness, self.label_names, self.hierarchy
         )
+        self.disjoint_weight = disjoint_loss_weight
 
     def forward(self, input, target, **kwargs):
-        loss = super().forward(input, target, **kwargs)
+        loss, base_loss, impl_loss = super().forward(input, target, **kwargs)
         pred = torch.sigmoid(input)
         l = pred[:, self.disjoint_filter_l]
         r = pred[:, self.disjoint_filter_r]
         disjointness_loss = self._calculate_implication_loss(l, 1 - r)
-        return loss + disjointness_loss
+        return (
+            loss + self.disjoint_weight * disjointness_loss,
+            base_loss,
+            impl_loss,
+            disjointness_loss,
+        )
 
 
 def _load_label_names(path_to_label_names):
@@ -78,17 +131,6 @@ def _load_label_names(path_to_label_names):
     return label_names
 
 
-def _load_implications(path_to_chebi, implication_cache=IMPLICATION_CACHE_FILE):
-    if os.path.isfile(implication_cache):
-        with open(implication_cache, "rb") as fin:
-            hierarchy = pickle.load(fin)
-    else:
-        hierarchy = extract_class_hierarchy(path_to_chebi)
-        with open(implication_cache, "wb") as fout:
-            pickle.dump(hierarchy, fout)
-    return hierarchy
-
-
 def _build_implication_filter(label_names, hierarchy):
     return torch.tensor(
         [
@@ -100,24 +142,33 @@ def _build_implication_filter(label_names, hierarchy):
     )
 
 
-def _build_disjointness_filter(path_to_disjointedness, label_names, hierarchy):
+def _build_disjointness_filter(path_to_disjointness, label_names, hierarchy):
     disjoints = set()
     label_dict = dict(map(reversed, enumerate(label_names)))
 
-    with open(path_to_disjointedness, "rt") as fin:
+    with open(path_to_disjointness, "rt") as fin:
         reader = csv.reader(fin)
         for l1_raw, r1_raw in reader:
             l1 = int(l1_raw)
             r1 = int(r1_raw)
+            if l1 == 36233 and r1 == 63353:
+                # ignore disaccharide-disaccharide derivative disjointness axiom
+                continue
             disjoints.update(
                 {
                     (label_dict[l2], label_dict[r2])
-                    for r2 in hierarchy.succ[r1]
+                    for r2 in list(hierarchy.succ[r1]) + [r1]
                     if r2 in label_names
-                    for l2 in hierarchy.succ[l1]
-                    if l2 in label_names and l2 < r2
+                    for l2 in list(hierarchy.succ[l1]) + [l1]
+                    if l2 in label_names
                 }
            )
 
     dis_filter = torch.tensor(list(disjoints))
     return dis_filter[:, 0], dis_filter[:, 1]
+
+
+if __name__ == "__main__":
+    loss = DisjointLoss(
+        os.path.join("data", "disjoint.csv"), ChEBIOver100(chebi_version=227)
+    )
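
The t-norm variants above all penalise predictions that violate an implication taken from the ChEBI hierarchy (roughly: "subclass implies superclass"). With the default product t-norm the per-pair penalty is l * (1 - r), which is large only when the subclass is predicted confidently while the superclass is not; "lukasiewicz" uses relu(l - r) and "xu19" uses -log(1 - l * (1 - r)). A standalone toy example (made-up values, not part of the diff):

import torch

l = torch.tensor([0.9, 0.9, 0.1])  # predicted probability of the subclass
r = torch.tensor([0.8, 0.1, 0.8])  # predicted probability of the superclass
print(l * (1 - r))  # tensor([0.1800, 0.8100, 0.0200]); only the middle pair strongly violates the implication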

chebai/models/base.py

Lines changed: 15 additions & 0 deletions
@@ -133,6 +133,21 @@ def _execute(self, batch, batch_idx, metrics, prefix="", log=True, sync_dist=Fal
         if self.pass_loss_kwargs:
             loss_kwargs = loss_kwargs_candidates
         loss = self.criterion(loss_data, loss_labels, **loss_kwargs)
+        if isinstance(loss, tuple):
+            loss_additional = loss[1:]
+            for i, loss_add in enumerate(loss_additional):
+                self.log(
+                    f"{prefix}loss_{i}",
+                    loss_add if isinstance(loss_add, int) else loss_add.item(),
+                    batch_size=len(batch),
+                    on_step=True,
+                    on_epoch=False,
+                    prog_bar=False,
+                    logger=True,
+                    sync_dist=sync_dist,
+                )
+            loss = loss[0]
+
         d["loss"] = loss
         self.log(
             f"{prefix}loss",

chebai/models/electra.py

Lines changed: 5 additions & 17 deletions
@@ -18,6 +18,8 @@
 
 logging.getLogger("pysmiles").setLevel(logging.CRITICAL)
 
+from chebai.loss.semantic import DisjointLoss as ElectraChEBIDisjointLoss  # noqa
+
 
 class ElectraPre(ChebaiBaseNet):
     """
@@ -245,14 +247,9 @@ def _process_for_loss(self, model_output, labels, loss_kwargs):
 
         """
         kwargs_copy = dict(loss_kwargs)
-        mask = kwargs_copy.pop("target_mask", None)
-        if mask is not None:
-            d = model_output["logits"] * mask - 100 * ~mask
-        else:
-            d = model_output["logits"]
         if labels is not None:
             labels = labels.float()
-        return d, labels, kwargs_copy
+        return model_output["logits"], labels, kwargs_copy
 
     def _get_prediction_and_labels(self, data, labels, model_output):
         """
@@ -267,16 +264,12 @@ def _get_prediction_and_labels(self, data, labels, model_output):
             tuple: A tuple containing the predictions and labels.
 
         """
-        mask = model_output.get("target_mask")
-        if mask is not None:
-            d = model_output["logits"] * mask - 100 * ~mask
-        else:
-            d = model_output["logits"]
+        d = model_output["logits"]
         loss_kwargs = data.get("loss_kwargs", dict())
         if "non_null_labels" in loss_kwargs:
             n = loss_kwargs["non_null_labels"]
             d = d[n]
-        return torch.sigmoid(d), labels.int()
+        return torch.sigmoid(d), labels.int() if labels is not None else None
 
     def forward(self, data, **kwargs):
         """
@@ -303,7 +296,6 @@ def forward(self, data, **kwargs):
         return dict(
             logits=self.output(d),
             attentions=electra.attentions,
-            target_mask=data.get("target_mask"),
         )
 
 
@@ -359,7 +351,6 @@ def _process_batch(self, batch, batch_idx):
             features=torch.cat((cls_tokens, batch.x), dim=1),
             labels=batch.y,
             model_kwargs=dict(attention_mask=mask),
-            target_mask=batch.target_mask,
         )
 
     @property
@@ -418,7 +409,6 @@ def __init__(self, cone_dimensions=20, **kwargs):
         )
 
     def _get_data_for_loss(self, model_output, labels):
-        mask = model_output.get("target_mask")
         d = model_output["predicted_vectors"]
         return dict(
             input=dict(
@@ -428,7 +418,6 @@ def _get_data_for_loss(self, model_output, labels):
         )
 
     def _get_prediction_and_labels(self, data, labels, model_output):
-        mask = model_output.get("target_mask")
         d = model_output["predicted_vectors"].unsqueeze(1)
 
         d = in_cone_parts(d, self.cone_axes, self.cone_arcs)
@@ -444,7 +433,6 @@ def forward(self, data, **kwargs):
         return dict(
             predicted_vectors=self.line_embedding(d),
             attentions=electra.attentions,
-            target_mask=data.get("target_mask"),
         )
 
 
chebai/preprocessing/bin/smiles_token/tokens.txt

Lines changed: 2 additions & 0 deletions
@@ -767,3 +767,5 @@ p
 [Nd]
 [Ti+3]
 [14CH3]
+[HH]
+[CH3-]
