review docstrings

sfluegel · sfluegel · commit 5c2914a61a23 · 2024-04-02T15:08:56.000+02:00
diff --git a/chebai/models/base.py b/chebai/models/base.py
@@ -115,7 +115,7 @@ def _execute(self, batch, batch_idx, metrics, prefix="", log=True, sync_dist=Fal
             sync_dist (bool, optional): Whether to synchronize distributed training. Defaults to False.
 
         Returns:
-            dict: A dictionary containing the processed data, labels, model output, predictions, and loss (if applicable).
+            dict: A dictionary containing the processed data, labels, model_output, predictions, and loss (if applicable).
         """
         assert isinstance(batch, XYData)
         batch = batch.to(self.device)
diff --git a/chebai/models/electra.py b/chebai/models/electra.py
@@ -23,11 +23,11 @@
 
 class ElectraPre(ChebaiBaseNet):
     """
-    ElectraPre class represents a pre-trained Electra model for pre-training inherited from ChebaiBaseNet.
+    ElectraPre class represents an Electra model for pre-training inherited from ChebaiBaseNet.
 
     Args:
         config (dict): Configuration parameters for the Electra model.
-        **kwargs: Additional keyword arguments.
+        **kwargs: Additional keyword arguments (passed to parent class).
 
     Attributes:
         NAME (str): Name of the ElectraPre model.
@@ -155,10 +155,11 @@ def _process_batch(self, batch, batch_idx):
 
         Args:
             batch (XYData): The input batch of data.
-            batch_idx (int): The index of the batch.
+            batch_idx (int): The index of the batch (not used).
 
         Returns:
-            dict: A dictionary containing the processed batch.
+            dict: A dictionary containing the processed batch, keys are `features`, `labels`, `model_kwargs`,
+                `loss_kwargs` and `idents`.
 
         """
         model_kwargs = dict()
@@ -253,7 +254,7 @@ def _process_for_loss(self, model_output, labels, loss_kwargs):
 
     def _get_prediction_and_labels(self, data, labels, model_output):
         """
-        Get the predictions and labels from the model output.
+        Get the predictions and labels from the model output. Applies a sigmoid to the model output.
 
         Args:
             data (dict): The input data.
@@ -276,12 +277,11 @@ def forward(self, data, **kwargs):
         Forward pass of the Electra model.
 
         Args:
-            data (dict): The input data.
-            **kwargs: Additional keyword arguments.
+            data (dict): The input data (expects a key `features`).
+            **kwargs: Additional keyword arguments for `self.electra`.
 
         Returns:
-            dict: A dictionary containing the model output.
-
+            dict: A dictionary containing the model output (logits and attentions).
         """
         self.batch_size = data["features"].shape[0]
         try:
diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py
@@ -22,14 +22,14 @@ class XYBaseDataModule(LightningDataModule):
 
     Args:
         batch_size (int): The batch size for data loading. Default is 1.
-        train_split (float): The ratio of training data to total data. Default is 0.85.
+        train_split (float): The ratio of training data to total data and of test data to (validation + test) data. Default is 0.85.
         reader_kwargs (dict): Additional keyword arguments to be passed to the data reader. Default is None.
-        prediction_kind (str): The kind of prediction to be performed. Default is "test".
-        data_limit (int): The maximum number of data samples to load. Default is None.
+        prediction_kind (str): The kind of prediction to be performed (only relevant for the predict_dataloader). Default is "test".
+        data_limit (int): The maximum number of data samples to load. If set to None, the complete dataset will be used. Default is None.
         label_filter (int): The index of the label to filter. Default is None.
         balance_after_filter (float): The ratio of negative samples to positive samples after filtering. Default is None.
         num_workers (int): The number of worker processes for data loading. Default is 1.
-        chebi_version (int): The version of ChEBI database to use. Default is 200.
+        chebi_version (int): The version of ChEBI to use. Default is 200.
         inner_k_folds (int): The number of folds for inner cross-validation. Use -1 to disable inner cross-validation. Default is -1.
         fold_index (int): The index of the fold to use for training and validation. Default is None.
         base_dir (str): The base directory for storing processed and raw data. Default is None.
@@ -45,9 +45,9 @@ class XYBaseDataModule(LightningDataModule):
         label_filter (int): The index of the label to filter.
         balance_after_filter (float): The ratio of negative samples to positive samples after filtering.
         num_workers (int): The number of worker processes for data loading.
-        chebi_version (int): The version of ChEBI database to use.
-        inner_k_folds (int): The number of folds for inner cross-validation.
-        fold_index (int): The index of the fold to use for training and validation.
+        chebi_version (int): The version of ChEBI to use.
+        inner_k_folds (int): The number of folds for inner cross-validation. If it is less than two, no cross-validation will be performed.
+        fold_index (int): The index of the fold to use for training and validation (only relevant for cross-validation).
         _base_dir (str): The base directory for storing processed and raw data.
         raw_dir (str): The directory for storing raw data.
         processed_dir (str): The directory for storing processed data.
diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
@@ -114,9 +114,11 @@ class _ChEBIDataExtractor(XYBaseDataModule, ABC):
     A class for extracting and processing data from the ChEBI dataset.
 
     Args:
-        chebi_version_train (int, optional): The version of ChEBI to use for training and validation. Defaults to None.
-        single_class (int, optional): The ID of the single class to predict. Defaults to None.
-        **kwargs: Additional keyword arguments.
+        chebi_version_train (int, optional): The version of ChEBI to use for training and validation. If not set,
+            chebi_version will be used for training, validation and test. Defaults to None.
+        single_class (int, optional): The ID of the single class to predict. If not set, all available labels will be
+            predicted. Defaults to None.
+        **kwargs: Additional keyword arguments (passed to XYBaseDataModule).
 
     Attributes:
         single_class (int): The ID of the single class to predict.
@@ -135,10 +137,10 @@ def __init__(
 
     def extract_class_hierarchy(self, chebi_path):
         """
-        Extract the class hierarchy from the ChEBI dataset.
+        Extracts the class hierarchy from the ChEBI ontology.
 
         Args:
-            chebi_path (str): The path to the ChEBI dataset.
+            chebi_path (str): The path to the ChEBI ontology.
 
         Returns:
             nx.DiGraph: The class hierarchy.
@@ -200,7 +202,7 @@ def _load_dict(self, input_file_path):
             input_file_path (str): The path to the file.
 
         Yields:
-            dict: The dictionary.
+            dict: The dictionary, keys are `features`, `labels` and `ident`.
         """
         with open(input_file_path, "rb") as input_file:
             df = pickle.load(input_file)
@@ -434,7 +436,7 @@ def prepare_data(self, *args, **kwargs):
         Prepares the data for the Chebi dataset.
 
         This method checks for the presence of raw data in the specified directory.
-        If the raw data is missing, it fetches the data and creates the test set.
+        If the raw data is missing, it fetches the ontology and creates a test set.
         If the test set already exists, it loads it from the file.
         Then, it creates the train/validation split based on the test set.
 
@@ -532,8 +534,8 @@ def select_classes(self, g, split_name, *args, **kwargs):
         Args:
             g (Graph): The graph representing the dataset.
             split_name (str): The name of the split.
-            *args: Additional arguments.
-            **kwargs: Additional keyword arguments.
+            *args: Additional arguments (not used).
+            **kwargs: Additional keyword arguments (not used).
 
         Returns:
             list: The list of selected classes.