import csv
import math
import os
import pickle
from typing import Literal

import torch

from chebai.preprocessing.datasets.chebi import _ChEBIDataExtractor, ChEBIOver100
1111
1212class ImplicationLoss (torch .nn .Module ):
1313 def __init__ (
14- self , path_to_chebi , path_to_label_names , base_loss : torch .nn .Module = None
14+ self ,
15+ data_extractor : _ChEBIDataExtractor ,
16+ base_loss : torch .nn .Module = None ,
17+ tnorm : Literal ["product" , "lukasiewicz" , "xu19" ] = "product" ,
18+ impl_loss_weight = 0.1 , # weight of implication loss in relation to base_loss
19+ pos_scalar = 1 ,
20+ pos_epsilon = 0.01 ,
1521 ):
1622 super ().__init__ ()
23+ self .data_extractor = data_extractor
1724 self .base_loss = base_loss
18- label_names = _load_label_names (path_to_label_names )
19- hierarchy = _load_implications (path_to_chebi )
20- implication_filter = _build_implication_filter (label_names , hierarchy )
25+ self .implication_cache_file = f"implications_{ self .data_extractor .name } .cache"
26+ self .label_names = _load_label_names (
27+ os .path .join (data_extractor .raw_dir , "classes.txt" )
28+ )
29+ self .hierarchy = self ._load_implications (
30+ os .path .join (data_extractor .raw_dir , "chebi.obo" )
31+ )
32+ implication_filter = _build_implication_filter (self .label_names , self .hierarchy )
2133 self .implication_filter_l = implication_filter [:, 0 ]
2234 self .implication_filter_r = implication_filter [:, 1 ]
35+ self .tnorm = tnorm
36+ self .impl_weight = impl_loss_weight
37+ self .pos_scalar = pos_scalar
38+ self .eps = pos_epsilon
2339
2440 def forward (self , input , target , ** kwargs ):
2541 nnl = kwargs .pop ("non_null_labels" , None )
@@ -36,40 +52,77 @@ def forward(self, input, target, **kwargs):
3652 r = pred [:, self .implication_filter_r ]
3753 # implication_loss = torch.sqrt(torch.mean(torch.sum(l*(1-r), dim=-1), dim=0))
3854 implication_loss = self ._calculate_implication_loss (l , r )
39- return base_loss + implication_loss
55+
56+ return (
57+ base_loss + self .impl_weight * implication_loss ,
58+ base_loss ,
59+ implication_loss ,
60+ )
4061
4162 def _calculate_implication_loss (self , l , r ):
42- capped_difference = torch .relu (l - r )
63+ assert not l .isnan ().any ()
64+ assert not r .isnan ().any ()
65+ if self .pos_scalar != 1 :
66+ l = (
67+ torch .pow (l + self .eps , 1 / self .pos_scalar )
68+ - math .pow (self .eps , 1 / self .pos_scalar )
69+ ) / (
70+ math .pow (1 + self .eps , 1 / self .pos_scalar )
71+ - math .pow (self .eps , 1 / self .pos_scalar )
72+ )
73+ r = torch .pow (r , self .pos_scalar )
74+ if self .tnorm == "product" :
75+ individual_loss = l * (1 - r )
76+ elif self .tnorm == "xu19" :
77+ individual_loss = - torch .log (1 - l * (1 - r ))
78+ elif self .tnorm == "lukasiewicz" :
79+ individual_loss = torch .relu (l - r )
80+ else :
81+ raise NotImplementedError (f"Unknown tnorm { self .tnorm } " )
82+
4383 return torch .mean (
44- torch .sum (
45- (torch .softmax (capped_difference , dim = - 1 ) * capped_difference ), dim = - 1
46- ),
84+ torch .sum (individual_loss , dim = - 1 ),
4785 dim = 0 ,
4886 )
4987
88+ def _load_implications (self , path_to_chebi ):
89+ if os .path .isfile (self .implication_cache_file ):
90+ with open (self .implication_cache_file , "rb" ) as fin :
91+ hierarchy = pickle .load (fin )
92+ else :
93+ hierarchy = self .data_extractor .extract_class_hierarchy (path_to_chebi )
94+ with open (self .implication_cache_file , "wb" ) as fout :
95+ pickle .dump (hierarchy , fout )
96+ return hierarchy
97+
5098
class DisjointLoss(ImplicationLoss):
    """ImplicationLoss extended with a penalty for violating disjointness axioms.

    ``disjoint(l, r)`` is scored as ``implies(l, not r)``, i.e. the implication
    loss applied to ``(l, 1 - r)``.
    """

    def __init__(
        self,
        path_to_disjointness,
        data_extractor: _ChEBIDataExtractor,
        base_loss: torch.nn.Module = None,
        disjoint_loss_weight: float = 100,
        **kwargs,
    ):
        """
        :param path_to_disjointness: CSV file of disjoint ChEBI-id pairs.
        :param data_extractor: forwarded to ``ImplicationLoss``.
        :param base_loss: forwarded to ``ImplicationLoss``.
        :param disjoint_loss_weight: multiplier for the disjointness term.
        :param kwargs: further ``ImplicationLoss`` options (tnorm, weights, ...).
        """
        super().__init__(data_extractor, base_loss, **kwargs)
        self.disjoint_filter_l, self.disjoint_filter_r = _build_disjointness_filter(
            path_to_disjointness, self.label_names, self.hierarchy
        )
        self.disjoint_weight = disjoint_loss_weight

    def forward(self, input, target, **kwargs):
        """Return (total, base_loss, implication_loss, disjointness_loss)."""
        loss, base_loss, impl_loss = super().forward(input, target, **kwargs)
        pred = torch.sigmoid(input)
        l = pred[:, self.disjoint_filter_l]
        r = pred[:, self.disjoint_filter_r]
        # disjointness of l and r == "l implies not r"
        disjointness_loss = self._calculate_implication_loss(l, 1 - r)
        return (
            loss + self.disjoint_weight * disjointness_loss,
            base_loss,
            impl_loss,
            disjointness_loss,
        )
73126
74127
75128def _load_label_names (path_to_label_names ):
@@ -78,17 +131,6 @@ def _load_label_names(path_to_label_names):
78131 return label_names
79132
80133
81- def _load_implications (path_to_chebi , implication_cache = IMPLICATION_CACHE_FILE ):
82- if os .path .isfile (implication_cache ):
83- with open (implication_cache , "rb" ) as fin :
84- hierarchy = pickle .load (fin )
85- else :
86- hierarchy = extract_class_hierarchy (path_to_chebi )
87- with open (implication_cache , "wb" ) as fout :
88- pickle .dump (hierarchy , fout )
89- return hierarchy
90-
91-
92134def _build_implication_filter (label_names , hierarchy ):
93135 return torch .tensor (
94136 [
@@ -100,24 +142,33 @@ def _build_implication_filter(label_names, hierarchy):
100142 )
101143
102144
103- def _build_disjointness_filter (path_to_disjointedness , label_names , hierarchy ):
145+ def _build_disjointness_filter (path_to_disjointness , label_names , hierarchy ):
104146 disjoints = set ()
105147 label_dict = dict (map (reversed , enumerate (label_names )))
106148
107- with open (path_to_disjointedness , "rt" ) as fin :
149+ with open (path_to_disjointness , "rt" ) as fin :
108150 reader = csv .reader (fin )
109151 for l1_raw , r1_raw in reader :
110152 l1 = int (l1_raw )
111153 r1 = int (r1_raw )
154+ if l1 == 36233 and r1 == 63353 :
155+ # ignore disaccharide-disaccharide derivative disjointness axiom
156+ continue
112157 disjoints .update (
113158 {
114159 (label_dict [l2 ], label_dict [r2 ])
115- for r2 in hierarchy .succ [r1 ]
160+ for r2 in list ( hierarchy .succ [ r1 ]) + [r1 ]
116161 if r2 in label_names
117- for l2 in hierarchy .succ [l1 ]
118- if l2 in label_names and l2 < r2
162+ for l2 in list ( hierarchy .succ [ l1 ]) + [l1 ]
163+ if l2 in label_names
119164 }
120165 )
121166
122167 dis_filter = torch .tensor (list (disjoints ))
123168 return dis_filter [:, 0 ], dis_filter [:, 1 ]
169+
170+
if __name__ == "__main__":
    # Smoke test: build the combined loss from a local disjointness CSV and
    # the ChEBI-over-100 dataset (version 227); exercises the full filter
    # construction, including the hierarchy cache.
    loss = DisjointLoss(
        os.path.join("data", "disjoint.csv"), ChEBIOver100(chebi_version=227)
    )