
Commit c06b058

Merge pull request #19 from ChEB-AI/code_documentation
Code documentation
2 parents 290348d + 5c2914a

4 files changed: +345 −1 lines changed

chebai/models/base.py

Lines changed: 43 additions & 0 deletions
@@ -13,6 +13,24 @@


 class ChebaiBaseNet(LightningModule):
+    """
+    Base class for Chebai neural network models, inheriting from PyTorch Lightning's LightningModule.
+
+    Args:
+        criterion (torch.nn.Module, optional): The loss criterion for the model. Defaults to None.
+        out_dim (int, optional): The output dimension of the model. Defaults to None.
+        train_metrics (torch.nn.Module, optional): The metrics to be used during training. Defaults to None.
+        val_metrics (torch.nn.Module, optional): The metrics to be used during validation. Defaults to None.
+        test_metrics (torch.nn.Module, optional): The metrics to be used during testing. Defaults to None.
+        pass_loss_kwargs (bool, optional): Whether to pass loss kwargs to the criterion. Defaults to True.
+        optimizer_kwargs (typing.Dict, optional): Additional keyword arguments for the optimizer. Defaults to None.
+        **kwargs: Additional keyword arguments.
+
+    Attributes:
+        NAME (str): The name of the model.
+        LOSS (torch.nn.Module): The loss function used by the model.
+    """
+
     NAME = None
     LOSS = torch.nn.BCEWithLogitsLoss
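
A minimal sketch of a concrete subclass, inferred from the documented constructor arguments (the class name, layer size, and forward logic are illustrative assumptions, not code from the repository):

    import torch

    class ToyNet(ChebaiBaseNet):
        NAME = "ToyNet"

        def __init__(self, input_dim=128, **kwargs):
            super().__init__(**kwargs)
            # out_dim is assumed to be stored by the base constructor
            self.layer = torch.nn.Linear(input_dim, self.out_dim)

        def forward(self, data, **kwargs):
            return {"logits": self.layer(data["features"])}

    model = ToyNet(input_dim=128, out_dim=500, criterion=torch.nn.BCEWithLogitsLoss())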

@@ -85,6 +103,20 @@ def predict_step(self, batch, batch_idx, **kwargs):
         return self._execute(batch, batch_idx, self.test_metrics, prefix="", log=False)

     def _execute(self, batch, batch_idx, metrics, prefix="", log=True, sync_dist=False):
+        """
+        Executes the model on a batch of data and returns the model output and predictions.
+
+        Args:
+            batch (XYData): The input batch of data.
+            batch_idx (int): The index of the current batch.
+            metrics (dict): A dictionary of metrics to track.
+            prefix (str, optional): A prefix to add to the metric names. Defaults to "".
+            log (bool, optional): Whether to log the metrics. Defaults to True.
+            sync_dist (bool, optional): Whether to synchronize the logged metrics across devices in distributed training. Defaults to False.
+
+        Returns:
+            dict: A dictionary containing the processed data, labels, model output, predictions, and loss (if applicable).
+        """
         assert isinstance(batch, XYData)
         batch = batch.to(self.device)
         data = self._process_batch(batch, batch_idx)
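
Based on that docstring, the result of a single step might be inspected as follows (the exact key names are an assumption drawn from the description, not verified against the implementation):

    result = model._execute(batch, 0, model.train_metrics, prefix="train_")
    # Documented contents: processed data, labels, model output,
    # predictions, and the loss when a criterion is configured.
    print(result.keys())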
@@ -134,6 +166,17 @@ def _execute(self, batch, batch_idx, metrics, prefix="", log=True, sync_dist=False):
         return d

     def _log_metrics(self, prefix, metrics, batch_size):
+        """
+        Logs the metrics for the given prefix.
+
+        Args:
+            prefix (str): The prefix to be added to the metric names.
+            metrics (dict): A dictionary containing the metrics to be logged.
+            batch_size (int): The batch size used for logging.
+
+        Returns:
+            None
+        """
         # don't use sync_dist=True if the metric is a torchmetrics-metric
         # (see https://github.com/Lightning-AI/pytorch-lightning/discussions/6501#discussioncomment-569757)
         for metric_name, metric in metrics.items():
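
The inline comment points at a known Lightning pitfall: torchmetrics objects perform their own cross-process reduction, so sync_dist=True is redundant for them and only needed for plain tensor values. A small sketch of the distinction (the metric choice and names are illustrative):

    import torch
    import torchmetrics

    f1 = torchmetrics.classification.MultilabelF1Score(num_labels=500)
    preds = torch.rand(8, 500)
    labels = torch.randint(0, 2, (8, 500))
    f1.update(preds, labels)
    # Inside a LightningModule, the metric object would be logged directly;
    # torchmetrics handles the reduction across processes:
    #     self.log(f"{prefix}f1", f1, batch_size=batch_size)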

chebai/models/electra.py

Lines changed: 106 additions & 0 deletions
@@ -22,6 +22,23 @@


 class ElectraPre(ChebaiBaseNet):
+    """
+    ElectraPre represents an Electra model for pre-training, inheriting from ChebaiBaseNet.
+
+    Args:
+        config (dict): Configuration parameters for the Electra model.
+        **kwargs: Additional keyword arguments (passed to the parent class).
+
+    Attributes:
+        NAME (str): Name of the ElectraPre model.
+        generator_config (ElectraConfig): Configuration for the generator model.
+        generator (ElectraForMaskedLM): Generator model for masked language modeling.
+        discriminator_config (ElectraConfig): Configuration for the discriminator model.
+        discriminator (ElectraForPreTraining): Discriminator model for pre-training.
+        replace_p (float): Probability of replacing tokens during training.
+    """
+
     NAME = "ElectraPre"

     def __init__(self, config=None, **kwargs):
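
A hypothetical instantiation, assuming the config dict maps onto standard Hugging Face ElectraConfig fields (all values below are illustrative):

    model = ElectraPre(
        config={"vocab_size": 1400, "hidden_size": 256, "num_attention_heads": 8},
        optimizer_kwargs={"lr": 1e-3},
    )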
@@ -34,12 +51,32 @@ def __init__(self, config=None, **kwargs):

     @property
     def as_pretrained(self):
+        """
+        Returns the discriminator model as a pre-trained model.
+
+        Returns:
+            ElectraForPreTraining: The discriminator model.
+        """
         return self.discriminator

     def _process_labels_in_batch(self, batch):
         return None

     def forward(self, data, **kwargs):
+        """
+        Forward pass of the ElectraPre model.
+
+        Args:
+            data (dict): Input data.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            tuple: A tuple containing the raw generator output and the discriminator output.
+                The generator output is a tensor of shape (batch_size, max_seq_len, vocab_size).
+                The discriminator output is a tensor of shape (batch_size, max_seq_len).
+        """
         features = data["features"]
         features = features.long()
         self.batch_size = batch_size = features.shape[0]
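
A sketch of how the documented return shapes could be checked (the model instance, batch, and dimensions are illustrative assumptions):

    import torch

    data = {"features": torch.randint(0, 1400, (4, 32))}  # (batch_size, max_seq_len)
    gen_out, disc_out = model(data)
    assert gen_out.shape == (4, 32, 1400)   # (batch_size, max_seq_len, vocab_size)
    assert disc_out.shape == (4, 32)        # (batch_size, max_seq_len)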
@@ -96,9 +133,35 @@ def filter_dict(d, filter_key):


 class Electra(ChebaiBaseNet):
+    """
+    Electra model implementation, inheriting from ChebaiBaseNet.
+
+    Args:
+        config (dict, optional): Configuration parameters for the Electra model. Defaults to None.
+        pretrained_checkpoint (str, optional): Path to the pretrained checkpoint file. Defaults to None.
+        load_prefix (str, optional): Prefix used to filter the state_dict keys from the pretrained checkpoint. Defaults to None.
+        **kwargs: Additional keyword arguments.
+
+    Attributes:
+        NAME (str): Name of the Electra model.
+    """
+
     NAME = "Electra"

     def _process_batch(self, batch, batch_idx):
+        """
+        Process a batch of data.
+
+        Args:
+            batch (XYData): The input batch of data.
+            batch_idx (int): The index of the batch (not used).
+
+        Returns:
+            dict: A dictionary containing the processed batch; keys are `features`, `labels`, `model_kwargs`,
+                `loss_kwargs` and `idents`.
+        """
         model_kwargs = dict()
         loss_kwargs = batch.additional_fields["loss_kwargs"]
         if "lens" in batch.additional_fields["model_kwargs"]:
@@ -125,6 +188,13 @@ def _process_batch(self, batch, batch_idx):

     @property
     def as_pretrained(self):
+        """
+        Get the pretrained Electra model.
+
+        Returns:
+            ElectraModel: The pretrained Electra model.
+        """
         return self.electra.electra

     def __init__(
@@ -149,6 +219,8 @@ def __init__(
             nn.Dropout(self.config.hidden_dropout_prob),
             nn.Linear(in_d, self.config.num_labels),
         )
+
+        # Load pretrained checkpoint if provided
         if pretrained_checkpoint:
             with open(pretrained_checkpoint, "rb") as fin:
                 model_dict = torch.load(fin, map_location=self.device)
@@ -163,12 +235,36 @@ def __init__(
             self.electra = ElectraModel(config=self.config)

     def _process_for_loss(self, model_output, labels, loss_kwargs):
+        """
+        Process the model output for calculating the loss.
+
+        Args:
+            model_output (dict): The output of the model.
+            labels (Tensor): The target labels.
+            loss_kwargs (dict): Additional loss arguments.
+
+        Returns:
+            tuple: A tuple containing the processed model output, labels, and loss arguments.
+        """
         kwargs_copy = dict(loss_kwargs)
         if labels is not None:
             labels = labels.float()
         return model_output["logits"], labels, kwargs_copy

     def _get_prediction_and_labels(self, data, labels, model_output):
+        """
+        Get the predictions and labels from the model output. Applies a sigmoid to the model output.
+
+        Args:
+            data (dict): The input data.
+            labels (Tensor): The target labels.
+            model_output (dict): The output of the model.
+
+        Returns:
+            tuple: A tuple containing the predictions and labels.
+        """
         d = model_output["logits"]
         loss_kwargs = data.get("loss_kwargs", dict())
         if "non_null_labels" in loss_kwargs:
@@ -177,6 +273,16 @@ def _get_prediction_and_labels(self, data, labels, model_output):
         return torch.sigmoid(d), labels.int() if labels is not None else None

     def forward(self, data, **kwargs):
+        """
+        Forward pass of the Electra model.
+
+        Args:
+            data (dict): The input data (expects a key `features`).
+            **kwargs: Additional keyword arguments for `self.electra`.
+
+        Returns:
+            dict: A dictionary containing the model output (logits and attentions).
+        """
         self.batch_size = data["features"].shape[0]
         try:
             inp = self.electra.embeddings.forward(data["features"].int())
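
Putting the pieces together, a classification call might look like this (the vocabulary size, sequence length, and out-of-trainer usage are illustrative assumptions):

    import torch

    data = {"features": torch.randint(0, 1400, (4, 32))}  # (batch_size, seq_len) token IDs
    output = model(data)                                   # {"logits": ..., "attentions": ...}
    probabilities = torch.sigmoid(output["logits"])        # per-label probabilities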

chebai/preprocessing/datasets/base.py

Lines changed: 89 additions & 0 deletions
@@ -14,6 +14,48 @@


 class XYBaseDataModule(LightningDataModule):
+    """
+    Base class for data modules.
+
+    This class provides a base implementation for loading and preprocessing datasets.
+    It inherits from `LightningDataModule` and defines common properties and methods for data loading and processing.
+
+    Args:
+        batch_size (int): The batch size for data loading. Default is 1.
+        train_split (float): The ratio of training data to total data, and of test data to (validation + test) data. Default is 0.85.
+        reader_kwargs (dict): Additional keyword arguments to be passed to the data reader. Default is None.
+        prediction_kind (str): The kind of prediction to be performed (only relevant for the predict_dataloader). Default is "test".
+        data_limit (int): The maximum number of data samples to load. If set to None, the complete dataset will be used. Default is None.
+        label_filter (int): The index of the label to filter. Default is None.
+        balance_after_filter (float): The ratio of negative samples to positive samples after filtering. Default is None.
+        num_workers (int): The number of worker processes for data loading. Default is 1.
+        chebi_version (int): The version of ChEBI to use. Default is 200.
+        inner_k_folds (int): The number of folds for inner cross-validation. Use -1 to disable inner cross-validation. Default is -1.
+        fold_index (int): The index of the fold to use for training and validation. Default is None.
+        base_dir (str): The base directory for storing processed and raw data. Default is None.
+        **kwargs: Additional keyword arguments.
+
+    Attributes:
+        READER (DataReader): The data reader class to use.
+        reader (DataReader): An instance of the data reader class.
+        train_split (float): The ratio of training data to total data.
+        batch_size (int): The batch size for data loading.
+        prediction_kind (str): The kind of prediction to be performed.
+        data_limit (int): The maximum number of data samples to load.
+        label_filter (int): The index of the label to filter.
+        balance_after_filter (float): The ratio of negative samples to positive samples after filtering.
+        num_workers (int): The number of worker processes for data loading.
+        chebi_version (int): The version of ChEBI to use.
+        inner_k_folds (int): The number of folds for inner cross-validation. If it is less than two, no cross-validation will be performed.
+        fold_index (int): The index of the fold to use for training and validation (only relevant for cross-validation).
+        _base_dir (str): The base directory for storing processed and raw data.
+        raw_dir (str): The directory for storing raw data.
+        processed_dir (str): The directory for storing processed data.
+        fold_dir (str): The name of the directory where the folds from inner cross-validation are stored.
+        _name (str): The name of the data module.
+    """
+
     READER = dr.DataReader

     def __init__(
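
A hypothetical concrete subclass and its instantiation, sketched from the documented arguments (the class name, reader, and _name value are assumptions):

    class MyChebiDataModule(XYBaseDataModule):
        READER = dr.ChemDataReader  # assumed reader class

        @property
        def _name(self):
            return "my_chebi"

    dm = MyChebiDataModule(batch_size=32, train_split=0.85, chebi_version=200, inner_k_folds=5)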
@@ -69,10 +111,12 @@ def __init__(

     @property
     def identifier(self):
+        """Identifier for the dataset."""
         return (self.reader.name(),)

     @property
     def full_identifier(self):
+        """Full identifier for the dataset."""
         return (self._name, *self.identifier)

     @property
@@ -84,10 +128,12 @@ def base_dir(self):

     @property
     def processed_dir(self):
+        """Name of the directory where the processed data is stored."""
         return os.path.join(self.base_dir, "processed", *self.identifier)

     @property
     def raw_dir(self):
+        """Name of the directory where the raw data is stored."""
         return os.path.join(self.base_dir, "raw")

     @property
@@ -100,10 +146,24 @@ def _name(self):
         raise NotImplementedError

     def _filter_labels(self, row):
+        """Filter labels based on label_filter."""
         row["labels"] = [row["labels"][self.label_filter]]
         return row

     def load_processed_data(self, kind: str = None, filename: str = None) -> List:
+        """
+        Load processed data from a file.
+
+        Args:
+            kind (str, optional): The kind of dataset to load, such as "train", "val" or "test". Defaults to None.
+            filename (str, optional): The name of the file to load the dataset from. Defaults to None.
+
+        Returns:
+            List: The loaded processed data.
+
+        Raises:
+            ValueError: If both kind and filename are None.
+        """
         if kind is None and filename is None:
             raise ValueError(
                 "Either kind or filename is required to load the correct dataset, both are None"
@@ -123,6 +183,17 @@ def load_processed_data(self, kind: str = None, filename: str = None) -> List:
         return torch.load(os.path.join(self.processed_dir, filename))

     def dataloader(self, kind, **kwargs) -> DataLoader:
+        """
+        Returns a DataLoader object for the specified kind (train, val or test) of data.
+
+        Args:
+            kind (str): Indicates whether train, val or test data should be loaded.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            DataLoader: A DataLoader object.
+        """
         dataset = self.load_processed_data(kind)
         if "ids" in kwargs:
             ids = kwargs.pop("ids")
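
A hedged usage sketch (assuming extra keyword arguments are forwarded to the underlying DataLoader):

    train_loader = dm.dataloader("train")
    for batch in train_loader:
        ...  # batches are collated by the module's reader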
@@ -155,6 +226,15 @@ def dataloader(self, kind, **kwargs) -> DataLoader:

     @staticmethod
     def _load_dict(input_file_path):
+        """
+        Load data from a file, yielding one dictionary per row.
+
+        Args:
+            input_file_path (str): The path to the input file.
+
+        Yields:
+            dict: A dictionary containing the features and labels.
+        """
         with open(input_file_path, "r") as input_file:
             for row in input_file:
                 smiles, labels = row.split("\t")
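
The split on a tab character implies the raw file is a TSV of SMILES strings and labels; a hypothetical row and its parse (the label encoding shown is an assumption):

    row = "CC(=O)OC1=CC=CC=C1C(=O)O\t[0, 1, 0]\n"
    smiles, labels = row.split("\t")
    # smiles -> "CC(=O)OC1=CC=CC=C1C(=O)O", labels -> "[0, 1, 0]\n"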
@@ -166,6 +246,15 @@ def _get_data_size(input_file_path):
         return sum(1 for _ in f)

     def _load_data_from_file(self, path):
+        """
+        Load data from a file and return a list of dictionaries.
+
+        Args:
+            path (str): The path to the input file.
+
+        Returns:
+            List: A list of dictionaries containing the features and labels.
+        """
         lines = self._get_data_size(path)
         print(f"Processing {lines} lines...")
         data = [
