diff --git a/.gitignore b/.gitignore index ea0ec8a..34f4ab7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__/ *.py[cod] *$py.class +testing_preprocessing.py # C extensions *.so *.dll diff --git a/clustpy/deep/_abstract_deep_clustering_algo.py b/clustpy/deep/_abstract_deep_clustering_algo.py index c776ff6..a9e04ad 100644 --- a/clustpy/deep/_abstract_deep_clustering_algo.py +++ b/clustpy/deep/_abstract_deep_clustering_algo.py @@ -1,3 +1,4 @@ +from collections import defaultdict from clustpy.deep._utils import set_torch_seed from sklearn.base import TransformerMixin, BaseEstimator, ClusterMixin import numpy as np @@ -37,6 +38,20 @@ def __init__(self, batch_size: int, neural_network: torch.nn.Module | tuple, neu self.embedding_size = embedding_size self.device = device self.random_state = random_state + self.history_ = defaultdict(list) + + def _log_history(self, key: str, value) -> None: + """ + Log pretraining and clustering history values (e.g. loss values) during training. 
+ + Parameters + ---------- + key : str + the key under which to store the value + value : float + + """ + self.history_[key].append(float(value)) def _check_parameters(self, X: np.ndarray, *, y: np.ndarray=None) -> (np.ndarray, np.ndarray, np.random.RandomState, dict, dict, dict): """ diff --git a/clustpy/deep/_train_utils.py b/clustpy/deep/_train_utils.py index e776d96..69f6389 100644 --- a/clustpy/deep/_train_utils.py +++ b/clustpy/deep/_train_utils.py @@ -105,13 +105,15 @@ def get_neural_network(input_dim: int, embedding_size: int = 10, neural_network: return neural_network -def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: np.ndarray = None, +def get_trained_network(trainloader: torch.utils.data.DataLoader = None, evalloader: torch.utils.data.DataLoader | None = None, data: np.ndarray = None, n_epochs: int = 100, batch_size: int = 128, optimizer_params: dict = None, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, device=None, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, embedding_size: int = 10, neural_network: torch.nn.Module | tuple = None, neural_network_class: torch.nn.Module = FeedforwardAutoencoder, - neural_network_params: dict = None, neural_network_weights: str = None, + neural_network_params: dict = None, + log_fn: Callable[[str, float], None] = None, + neural_network_weights: str = None, random_state: np.random.RandomState | int = None) -> torch.nn.Module: """This function returns a trained neural network. The following cases are considered - If the neural network is initialized and trained (neural_network.fitted==True), then return input neural network without training it again. 
@@ -123,6 +125,8 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n ---------- trainloader : torch.utils.data.DataLoader dataloader used to train neural_network (default: None) + evalloader : torch.utils.data.DataLoader | None + dataloader used for earlystopping during training (default: None) data : np.ndarray train data set. If data is passed then trainloader can remain empty (default: None) n_epochs : int @@ -147,6 +151,8 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n The neural network class that should be used (default: FeedforwardAutoencoder) neural_network_params : dict Parameters to be used when creating a new neural network using the neural_network_class (default: None) + log_fn : Callable[[str, float], None] + Function to log pretraining information such as loss values. It has to take a string (key) and a float (value) as input parameters. neural_network_weights : str Path to a file containing the state_dict of the neural_network (default: None) random_state : np.random.RandomState | int @@ -170,12 +176,12 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n print("Neural network is not fitted yet, will be pretrained.") # Pretrain neural network optimizer_params = {"lr": 1e-3} if optimizer_params is None else optimizer_params - neural_network.fit(n_epochs=n_epochs, optimizer_params=optimizer_params, dataloader=trainloader, - optimizer_class=optimizer_class, ssl_loss_fn=ssl_loss_fn) + neural_network.fit(n_epochs=n_epochs, optimizer_params=optimizer_params, dataloader=trainloader, evalloader=evalloader, + optimizer_class=optimizer_class, ssl_loss_fn=ssl_loss_fn,log_fn=log_fn) return neural_network -def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_clusters: int, batch_size: int, +def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, val_set: np.ndarray | torch.Tensor | None, n_clusters: int, batch_size: 
int, pretrain_optimizer_params: dict, pretrain_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, @@ -185,6 +191,7 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c random_state: np.random.RandomState, neural_network_class: torch.nn.Module = FeedforwardAutoencoder, neural_network_params: dict = None, + log_fn: Callable[[str, float], None] = None, neural_network_weights: str = None) -> ( torch.device, torch.utils.data.DataLoader, torch.utils.data.DataLoader, int, torch.nn.Module, np.ndarray, int, np.ndarray, np.ndarray, ClusterMixin): @@ -196,6 +203,8 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c ---------- X : np.ndarray | torch.Tensor the given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray | torch.Tensor | None + validation data set. Can be a np.ndarray or a torch.Tensor. If None, no validation set will be used n_clusters : int number of clusters. Can be None if a corresponding initial_clustering_class is given, e.g. DBSCAN batch_size : int @@ -231,6 +240,8 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c The neural network class that should be used (default: FeedforwardAutoencoder) neural_network_params : dict Parameters to be used when creating a new neural network using the neural_network_class (default: None) + log_fn : Callable[[str, float], None] + Function to log pretraining information such as loss values. It has to take a string (key) and a float (value) as input parameters. 
neural_network_weights : str Path to a file containing the state_dict of the neural_network (default: None) @@ -250,12 +261,17 @@ """ device = detect_device(device) trainloader, testloader, batch_size = get_train_and_test_dataloader(X, batch_size, custom_dataloaders) - neural_network = get_trained_network(trainloader, n_epochs=pretrain_epochs, + if val_set is not None: + evalloader = get_dataloader(val_set, batch_size, shuffle=False) + else: + evalloader = None + neural_network = get_trained_network(trainloader, evalloader=evalloader, n_epochs=pretrain_epochs, optimizer_params=pretrain_optimizer_params, optimizer_class=optimizer_class, device=device, ssl_loss_fn=ssl_loss_fn, embedding_size=embedding_size, neural_network=neural_network, neural_network_class=neural_network_class, neural_network_params=neural_network_params, neural_network_weights=neural_network_weights, + log_fn=log_fn, random_state=random_state) # Execute initial clustering in embedded space embedded_data = encode_batchwise(testloader, neural_network) diff --git a/clustpy/deep/aec.py b/clustpy/deep/aec.py index fcf6e61..6ebb6ff 100644 --- a/clustpy/deep/aec.py +++ b/clustpy/deep/aec.py @@ -14,14 +14,14 @@ from collections.abc import Callable -def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, +def _aec(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, device: torch.device, - 
random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): + log_fn: Callable | None,random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual AEC clustering procedure on the input data set. @@ -31,6 +31,8 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par the given data set. Can be a np.ndarray or a torch.Tensor n_clusters : int number of clusters. Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN + val_set : np.ndarray | None + Optional validation set for early stopping. If not None, Early stopping will be used batch_size : int size of the data batches pretrain_optimizer_params : dict @@ -70,9 +72,13 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par parameters for the initial clustering class device : torch.device The device on which to perform the computations + log_fn : Callable | None + function for logging training history values (e.g. 
loss values) during training random_state : np.random.RandomState use a fixed random state to get a repeatable solution + + Returns ------- tuple : (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module) @@ -82,11 +88,11 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_centers, _ = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup AEC Module - aec_module = _AEC_Module(init_labels, init_centers, augmentation_invariance).to_device(device) + aec_module = _AEC_Module(init_labels, init_centers, augmentation_invariance,log_fn).to_device(device) # Use AEC optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()), **clustering_optimizer_params) # AEC Training loop @@ -120,11 +126,13 @@ class _AEC_Module(_DCN_Module): the cluster centers augmentation_invariance : bool Is augmentation invariance used + log_fn : Callable | None + function for logging training history values (e.g. 
loss values) during training """ def __init__(self, init_np_labels: np.ndarray, init_np_centers: np.ndarray, - augmentation_invariance: bool = False): - super().__init__(init_np_labels, init_np_centers, augmentation_invariance) + augmentation_invariance: bool = False, log_fn: Callable | None = None): + super().__init__(init_np_labels, init_np_centers, augmentation_invariance,log_fn) def update_centroids(self, embedded: np.ndarray, labels: np.ndarray) -> torch.Tensor: """ @@ -188,14 +196,18 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat for _ in tbar: # Update Network total_loss = 0 + total_ssl_loss = 0 + total_clustering_loss = 0 for batch in trainloader: # Beware that the clustering loss of DCN is divided by 2, therefore we use 2 * clustering_loss_weight loss = self._loss(batch, neural_network, ssl_loss_fn, ssl_loss_weight, 2 * clustering_loss_weight, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() + total_clustering_loss += loss[2].item() # Backward pass - update weights optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) @@ -207,6 +219,10 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat # update assignments labels = self.predict_hard(torch.tensor(embedded).to(device)) self.labels = labels.to(device) + if self.log_fn is not None: + self.log_fn("Total Loss", total_loss) + self.log_fn("SSL Loss", total_ssl_loss) + self.log_fn("Clustering Loss", total_clustering_loss) return self @@ -315,7 +331,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'AEC': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'AEC': """ 
Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. @@ -333,7 +349,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'AEC': this instance of the AEC algorithm """ X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, initial_clustering_params = self._check_parameters(X, y=y) - aec_labels, aec_centers, neural_network = _aec(X, self.n_clusters, self.batch_size, + aec_labels, aec_centers, neural_network = _aec(X, val_set, self.n_clusters, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, self.pretrain_epochs, @@ -349,6 +365,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'AEC': self.initial_clustering_class, initial_clustering_params, self.device, + self._log_history, random_state) self.labels_ = aec_labels self.cluster_centers_ = aec_centers diff --git a/clustpy/deep/dcn.py b/clustpy/deep/dcn.py index 7fcf906..f3a1133 100644 --- a/clustpy/deep/dcn.py +++ b/clustpy/deep/dcn.py @@ -15,14 +15,15 @@ from collections.abc import Callable -def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, +def _dcn(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, device: torch.device, - random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): + random_state: np.random.RandomState, + log_fn: Callable | None) -> (np.ndarray, np.ndarray, np.ndarray, 
np.ndarray, torch.nn.Module): """ Start the actual DCN clustering procedure on the input data set. @@ -32,6 +33,8 @@ def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par the given data set. Can be a np.ndarray or a torch.Tensor n_clusters : int number of clusters. Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN + val_set : np.ndarray | None + Optional validation set for early stopping. If not None, Early stopping will be used batch_size : int size of the data batches pretrain_optimizer_params : dict @@ -71,6 +74,8 @@ def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par parameters for the initial clustering class device : torch.device The device on which to perform the computations + log_fn : Callable | None + function for logging training history values (e.g. loss values) during training random_state : np.random.RandomState use a fixed random state to get a repeatable solution @@ -85,11 +90,11 @@ def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_centers, _ = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup DCN Module - dcn_module = _DCN_Module(init_labels, init_centers, augmentation_invariance).to_device(device) + dcn_module = _DCN_Module(init_labels, init_centers, 
augmentation_invariance,log_fn).to_device(device) # Use DCN optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()), **clustering_optimizer_params) # DEC Training loop @@ -152,6 +157,8 @@ class _DCN_Module(torch.nn.Module): augmentation_invariance : bool If True, augmented samples provided in custom_dataloaders[0] will be used to learn cluster assignments that are invariant to the augmentation transformations (default: False) + log_fn : Callable | None + function for logging training history values (e.g. loss values) during training Attributes ---------- @@ -163,7 +170,7 @@ class _DCN_Module(torch.nn.Module): Is augmentation invariance used """ - def __init__(self, init_np_labels: np.ndarray, init_np_centers: np.ndarray, augmentation_invariance: bool = False): + def __init__(self, init_np_labels: np.ndarray, init_np_centers: np.ndarray, augmentation_invariance: bool = False,log_fn: Callable | None=None): super().__init__() self.augmentation_invariance = augmentation_invariance self.labels = torch.from_numpy(init_np_labels) @@ -171,6 +178,7 @@ def __init__(self, init_np_labels: np.ndarray, init_np_centers: np.ndarray, augm # Init for count from original DCN code (not reported in Paper) # This means centroid learning rate at the beginning is scaled by a hundred self.counts = torch.ones(self.centers.shape[0], dtype=torch.int32) * 100 + self.log_fn = log_fn def dcn_loss(self, embedded: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: """ @@ -188,7 +196,7 @@ def dcn_loss(self, embedded: torch.Tensor, labels: torch.Tensor) -> torch.Tensor loss: torch.Tensor the final DCN loss """ - loss = (embedded - self.centers[labels]).pow(2).sum() / embedded.shape[0] + loss = (embedded - self.centers[labels.long()]).pow(2).sum() / embedded.shape[0] return loss def predict_hard(self, embedded: torch.Tensor) -> torch.Tensor: @@ -266,6 +274,8 @@ def _loss(self, batch: list, neural_network: 
torch.nn.Module, ssl_loss_fn: Calla weight of the clustering loss ssl_loss_weight : float weight of the self-supervised learning (ssl) loss + log_fn : Callable | None + function for logging training history values (e.g. loss values) during training device : torch.device device to be trained on @@ -290,8 +300,7 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, ssl_loss_fn: Calla # compute total loss loss = ssl_loss_weight * ssl_loss + 0.5 * clustering_loss_weight * cluster_loss - - return loss + return loss, ssl_loss, cluster_loss def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, testloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, @@ -331,13 +340,17 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat for _ in tbar: # Update Network total_loss = 0 + ssl_loss = 0 + clustering_loss = 0 for batch in trainloader: loss = self._loss(batch, neural_network, ssl_loss_fn, ssl_loss_weight, clustering_loss_weight, device) - total_loss += loss.item() + total_loss += loss[0].item() + ssl_loss += loss[1].item() + clustering_loss += loss[2].item() # Backward pass - update weights optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() # Update Assignments and Centroids with torch.no_grad(): @@ -362,6 +375,10 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat self.counts = counts postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) + if self.log_fn is not None: + self.log_fn("total_loss", total_loss) + self.log_fn("ssl_loss", ssl_loss) + self.log_fn("clustering_loss", clustering_loss) return self @@ -473,7 +490,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DCN': + def fit(self, X: np.ndarray, 
val_set: np.ndarray = None, y: np.ndarray = None) -> 'DCN': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. @@ -491,7 +508,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DCN': this instance of the DCN algorithm """ X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, initial_clustering_params = self._check_parameters(X, y=y) - kmeans_labels, kmeans_centers, dcn_labels, dcn_centers, neural_network = _dcn(X, self.n_clusters, + kmeans_labels, kmeans_centers, dcn_labels, dcn_centers, neural_network = _dcn(X, val_set, self.n_clusters, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, @@ -509,7 +526,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DCN': self.initial_clustering_class, initial_clustering_params, self.device, - random_state) + random_state, + log_fn=self._log_history) self.labels_ = kmeans_labels self.cluster_centers_ = kmeans_centers self.dcn_labels_ = dcn_labels diff --git a/clustpy/deep/ddc_n2d.py b/clustpy/deep/ddc_n2d.py index 6cde45b..e1dd995 100644 --- a/clustpy/deep/ddc_n2d.py +++ b/clustpy/deep/ddc_n2d.py @@ -6,7 +6,7 @@ import torch import numpy as np from clustpy.deep._utils import detect_device, encode_batchwise, run_initial_clustering, mean_squared_error -from clustpy.deep._data_utils import get_train_and_test_dataloader +from clustpy.deep._data_utils import get_train_and_test_dataloader, get_dataloader from clustpy.deep._train_utils import get_trained_network from clustpy.deep._abstract_deep_clustering_algo import _AbstractDeepClusteringAlgo from sklearn.manifold import TSNE @@ -18,13 +18,13 @@ from clustpy.utils.checks import check_parameters -def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, +def _manifold_based_sequential_dc(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, batch_size: int, 
pretrain_optimizer_params: dict, pretrain_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, custom_dataloaders: tuple, manifold_class: TransformerMixin, manifold_params: dict, clustering_class: ClusterMixin, clustering_params: dict, device: torch.device, - random_state: np.random.RandomState) -> ( + random_state: np.random.RandomState, log_fn: Callable | None ) -> ( int, np.ndarray, np.ndarray, torch.nn.Module, TransformerMixin): """ Execute a manifold-based sequential deep clustering procedure on the input data set. @@ -33,6 +33,8 @@ def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: in ---------- X : np.ndarray / torch.Tensor the given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray / torch.Tensor | None + validation set (can be ignored) n_clusters : int number of clusters (can be None) batch_size : int @@ -69,6 +71,8 @@ def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: in The device on which to perform the computations random_state : np.random.RandomState use a fixed random state to get a repeatable solution + log_fn : Callable | None + function for logging training history values (e.g. 
loss values) during training Returns ------- @@ -83,12 +87,16 @@ def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: in # Get the device to train on device = detect_device(device) trainloader, testloader, _ = get_train_and_test_dataloader(X, batch_size, custom_dataloaders) + if val_set is not None: + valloader = get_dataloader(val_set, batch_size, shuffle=False) + else: + valloader = None # Get initial AE - neural_network = get_trained_network(trainloader, n_epochs=pretrain_epochs, + neural_network = get_trained_network(trainloader,valloader, n_epochs=pretrain_epochs, optimizer_params=pretrain_optimizer_params, optimizer_class=optimizer_class, device=device, ssl_loss_fn=ssl_loss_fn, embedding_size=embedding_size, neural_network=neural_network, neural_network_weights=neural_network_weights, - random_state=random_state) + log_fn=log_fn, random_state=random_state) # Encode data X_embed = encode_batchwise(testloader, neural_network) # Get possible input parameters of the manifold class @@ -135,7 +143,7 @@ class DDC_density_peak_clustering(ClusterMixin, BaseEstimator): def __init__(self, ratio: float): self.ratio = ratio - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DDC_density_peak_clustering': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'DDC_density_peak_clustering': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. @@ -308,13 +316,14 @@ class DDC(_AbstractDeepClusteringAlgo): Knowledge-Based Systems 197 (2020): 105841. 
""" - def __init__(self, ratio: float = 0.1, batch_size: int = 256, pretrain_optimizer_params: dict = None, + def __init__(self,n_clusters: int = None, ratio: float = 0.1, batch_size: int = 256, pretrain_optimizer_params: dict = None, pretrain_epochs: int = 100, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, embedding_size: int = 10, custom_dataloaders: tuple = None, tsne_params: dict = None, device: torch.device = None, random_state: np.random.RandomState | int = None): super().__init__(batch_size, neural_network, neural_network_weights, embedding_size, device, random_state) + self.n_clusters = n_clusters self.ratio = ratio self.pretrain_optimizer_params = pretrain_optimizer_params self.pretrain_epochs = pretrain_epochs @@ -323,7 +332,7 @@ def __init__(self, ratio: float = 0.1, batch_size: int = 256, pretrain_optimizer self.custom_dataloaders = custom_dataloaders self.tsne_params = tsne_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DDC': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'DDC': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -344,7 +353,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DDC': tsne_params = {"n_components": 2} if self.tsne_params is None else self.tsne_params if self.ratio > 1: print("[WARNING] ratio for DDC algorithm has been set to a value > 1 which can cause poor results") - n_clusters, labels, centers_ae, _, neural_network, tsne = _manifold_based_sequential_dc(X, None, self.batch_size, + n_clusters, labels, centers_ae, _, neural_network, tsne = _manifold_based_sequential_dc(X,val_set, self.n_clusters, self.batch_size, pretrain_optimizer_params, self.pretrain_epochs, self.optimizer_class, @@ -356,7 +365,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DDC': tsne_params, DDC_density_peak_clustering, {"ratio": self.ratio}, self.device, - random_state) + random_state,self._log_history) self.labels_ = labels self.n_clusters_ = n_clusters self.cluster_centers_ = centers_ae @@ -470,7 +479,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize self.manifold_params = manifold_params self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'N2D': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'N2D': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -479,6 +488,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'N2D': ---------- X : np.ndarray the given data set + val_set : np.ndarray + validation set (can be ignored) y : np.ndarray the labels (can be ignored) @@ -489,7 +500,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'N2D': """ X, _, random_state, pretrain_optimizer_params, _, initial_clustering_params = self._check_parameters(X, y=y) manifold_params = {"n_components": self.n_clusters} if self.manifold_params is None else self.manifold_params - _, labels, centers_ae, centers_manifold, neural_network, manifold = _manifold_based_sequential_dc(X, self.n_clusters, + _, labels, centers_ae, centers_manifold, neural_network, manifold = _manifold_based_sequential_dc(X, val_set, self.n_clusters, self.batch_size, pretrain_optimizer_params, self.pretrain_epochs, @@ -503,7 +514,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'N2D': manifold_params, GMM, initial_clustering_params, self.device, - random_state) + random_state, + self._log_history) self.labels_ = labels.astype(np.int32) self.cluster_centers_manifold_ = centers_manifold self.cluster_centers_ = centers_ae diff --git a/clustpy/deep/dec.py b/clustpy/deep/dec.py index a2501fb..342ec81 100644 --- a/clustpy/deep/dec.py +++ b/clustpy/deep/dec.py @@ -16,13 +16,14 @@ from collections.abc import Callable# -def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain_optimizer_params: dict, +def _dec(X: np.ndarray,val_set: np.ndarray | None, n_clusters: int, alpha: float, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, 
initial_clustering_class: ClusterMixin, initial_clustering_params: dict, - device: torch.device, random_state: np.random.RandomState) -> ( + device: torch.device, random_state: np.random.RandomState, + log_fn: Callable | None) -> ( np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual DEC clustering procedure on the input data set. @@ -31,6 +32,8 @@ def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain ---------- X : np.ndarray / torch.Tensor the given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray | None + Optional validation set for early stopping. If not None, Early stopping will be used n_clusters : int number of clusters. Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN alpha : float @@ -88,11 +91,11 @@ def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, _, init_centers, _ = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup DEC Module - dec_module = _DEC_Module(init_centers, alpha, augmentation_invariance).to(device) + dec_module = _DEC_Module(init_centers, alpha, augmentation_invariance,log_fn).to(device) # Use DEC optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()) + 
list(dec_module.parameters()), **clustering_optimizer_params) @@ -205,10 +208,11 @@ class _DEC_Module(torch.nn.Module): Is augmentation invariance used """ - def __init__(self, init_centers: np.ndarray, alpha: float, augmentation_invariance: bool = False): + def __init__(self, init_centers: np.ndarray, alpha: float, augmentation_invariance: bool = False,log_fn: Callable | None = None): super().__init__() self.alpha = alpha self.augmentation_invariance = augmentation_invariance + self.log_fn = log_fn # Centers are learnable parameters self.centers = torch.nn.Parameter(torch.tensor(init_centers), requires_grad=True) @@ -331,6 +335,8 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, clustering_loss_we the final DEC loss """ loss = torch.tensor(0.).to(device) + ssl_loss = torch.tensor(0.).to(device) + cluster_loss = torch.tensor(0.).to(device) # Reconstruction loss is not included in DEC if ssl_loss_weight != 0: if self.augmentation_invariance: @@ -355,7 +361,7 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, clustering_loss_we cluster_loss = self.dec_loss(embedded) loss += cluster_loss * clustering_loss_weight - return loss + return loss, ssl_loss, cluster_loss def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, optimizer: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, @@ -390,16 +396,26 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat tbar = tqdm.trange(n_epochs, desc="DEC training") for _ in tbar: total_loss = 0 + total_ssl_loss = 0 + total_cluster_loss = 0 for batch in trainloader: loss = self._loss(batch, neural_network, clustering_loss_weight, ssl_loss_weight, ssl_loss_fn, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() if ssl_loss_weight != 0 else 0 + total_cluster_loss += loss[2].item() + # Backward pass optimizer.zero_grad() - 
loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) + if self.log_fn is not None: + self.log_fn("Total Loss", total_loss) + if ssl_loss_weight != 0: + self.log_fn("SSL Loss", total_ssl_loss) + self.log_fn("Clustering Loss", total_cluster_loss) return self @@ -511,7 +527,7 @@ def __init__(self, n_clusters: int = 8, alpha: float = 1.0, batch_size: int = 25 self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEC': + def fit(self, X: np.ndarray,val_set: np.ndarray | None = None, y: np.ndarray = None) -> 'DEC': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. @@ -520,6 +536,9 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEC': ---------- X : np.ndarray the given data set + val_set : np.ndarray | None + optional validation set for monitoring purposes (can be ignored) + y : np.ndarray the labels (can be ignored) @@ -530,7 +549,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEC': """ ssl_loss_weight = self.ssl_loss_weight if hasattr(self, "ssl_loss_weight") else 0 # DEC does not use ssl loss when clustering X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, initial_clustering_params = self._check_parameters(X, y=y) - kmeans_labels, kmeans_centers, dec_labels, dec_centers, neural_network = _dec(X, self.n_clusters, self.alpha, + kmeans_labels, kmeans_centers, dec_labels, dec_centers, neural_network = _dec(X,val_set, self.n_clusters, self.alpha, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, @@ -547,7 +566,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEC': self.augmentation_invariance, self.initial_clustering_class, initial_clustering_params, - self.device, random_state) + self.device, random_state, + 
log_fn=self._log_history) self.labels_ = kmeans_labels self.cluster_centers_ = kmeans_centers self.dec_labels_ = dec_labels diff --git a/clustpy/deep/deepect.py b/clustpy/deep/deepect.py index 028afbb..364ec2e 100644 --- a/clustpy/deep/deepect.py +++ b/clustpy/deep/deepect.py @@ -79,10 +79,12 @@ class _DeepECT_Module(torch.nn.Module): augmentation_invariance : bool If True, augmented samples provided in custom_dataloaders[0] will be used to learn cluster assignments that are invariant to the augmentation transformations (default: False) + log_fn : Callable | None + function for logging training history values (e.g. loss values) during training """ def __init__(self, cluster_tree: BinaryClusterTree, max_n_leaf_nodes: int, grow_interval: int, - pruning_threshold: float, augmentation_invariance: bool = False): + pruning_threshold: float, augmentation_invariance: bool = False,log_fn: Callable | None = None): super().__init__() # Create initial cluster tree self.cluster_tree = cluster_tree @@ -90,6 +92,7 @@ def __init__(self, cluster_tree: BinaryClusterTree, max_n_leaf_nodes: int, grow_ self.grow_interval = grow_interval self.pruning_threshold = pruning_threshold self.augmentation_invariance = augmentation_invariance + self.log_fn = log_fn def predict_hard(self, embedded: torch.Tensor) -> torch.Tensor: """ @@ -137,7 +140,7 @@ def _get_labels_from_leafs(self, embedded: torch.Tensor, leaf_nodes: list) -> ( leaf_labels = torch.stack([leaf.torch_labels[0] for leaf in leaf_nodes]) # Get distances between points and centers. 
Get nearest center squared_diffs = squared_euclidean_distance(embedded, leaf_centers) - cluster_center_assignments = (squared_diffs.min(dim=1)[1]).int() + cluster_center_assignments = (squared_diffs.min(dim=1)[1]).long() labels = leaf_labels[cluster_center_assignments] return leaf_centers, cluster_center_assignments, labels @@ -372,7 +375,7 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, ssl_loss_fn: Calla dc_loss = self._data_compression_loss(embedded, split_nodes, labels, device, embedded_aug) # Combine losses loss = clustering_loss_weight * (nc_loss + dc_loss) + ssl_loss_weight * ssl_loss - return loss, labels + return (loss, ssl_loss, nc_loss+dc_loss), labels def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, testloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, @@ -415,6 +418,8 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat for epoch in tbar: # Update Network total_loss = 0 + total_ssl_loss = 0 + total_clust_loss = 0 with torch.no_grad(): # Grow tree if (epoch % self.grow_interval == 0 or self.cluster_tree.n_leaf_nodes_ < 2) and len( @@ -426,10 +431,12 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat # Calculate loss loss, labels = self._loss(batch, neural_network, ssl_loss_fn, clustering_loss_weight, ssl_loss_weight, leaf_nodes, split_nodes, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() + total_clust_loss += loss[2].item() # Backward pass - update weights optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() # Adapt centers and weights of split nodes analytically with torch.no_grad(): @@ -440,16 +447,20 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat leaf_nodes, split_nodes = self.cluster_tree.get_leaf_and_split_nodes() postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) + if 
self.log_fn is not None: + self.log_fn("Total Loss", total_loss) + self.log_fn("SSL Loss", total_ssl_loss) + self.log_fn("Clustering Loss", total_clust_loss) return self -def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_optimizer_params: dict, +def _deep_ect(X: np.ndarray, val_set: np.ndarray | None, max_n_leaf_nodes: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, grow_interval: int, pruning_threshold: float, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, device: torch.device, - random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, torch.nn.Module): + log_fn: Callable | None,random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual DeepECT clustering procedure on the input data set. @@ -457,6 +468,8 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op ---------- X : np.ndarray The given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray + validation set (can be ignored) max_n_leaf_nodes : int Maximum number of leaf nodes in the cluster tree batch_size : int @@ -497,6 +510,8 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op If True, augmented samples provided in custom_dataloaders[0] will be used to learn cluster assignments that are invariant to the augmentation transformations device : torch.device The device on which to perform the computations + log_fn : Callable | None + function for logging training history values (e.g. 
loss values) during training random_state : np.random.RandomState use a fixed random state to get a repeatable solution @@ -509,13 +524,13 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, _, _, init_leafnode_centers, _ = get_default_deep_clustering_initialization( - X, 2, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, 2, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, KMeans, {"n_init": 20}, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) cluster_tree = BinaryClusterTree(_DeepECT_ClusterTreeNode) # Setup DeepECT Module deepect_module = _DeepECT_Module(cluster_tree, max_n_leaf_nodes, grow_interval, pruning_threshold, - augmentation_invariance).to(device) + augmentation_invariance,log_fn).to(device) # Use DeepECT optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()), **clustering_optimizer_params) # DeepECT Training loop @@ -626,7 +641,7 @@ def __init__(self, max_n_leaf_nodes: int = 20, batch_size: int = 256, pretrain_o self.custom_dataloaders = custom_dataloaders self.augmentation_invariance = augmentation_invariance - def fit(self, X: np.ndarray, y: np.ndarray = None) -> "DeepECT": + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> "DeepECT": """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -635,6 +650,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> "DeepECT": ---------- X : np.ndarray the given data set + val_set : np.ndarray + validation set (can be ignored) y : np.ndarray the labels (can be ignored) @@ -644,14 +661,14 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> "DeepECT": This instance of the DeepECT algorithm """ X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, _ = self._check_parameters(X, y=y) - tree, labels, neural_network = _deep_ect(X, self.max_n_leaf_nodes, self.batch_size, + tree, labels, neural_network = _deep_ect(X, val_set, self.max_n_leaf_nodes, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, self.pretrain_epochs, self.clustering_epochs, self.grow_interval, self.pruning_threshold, self.optimizer_class, self.ssl_loss_fn, self.neural_network, self.neural_network_weights, self.embedding_size, self.clustering_loss_weight, self.ssl_loss_weight, self.custom_dataloaders, self.augmentation_invariance, self.device, - random_state) + self._log_history, random_state) self.tree_ = tree self.labels_ = labels self.neural_network_trained_ = neural_network diff --git a/clustpy/deep/den.py b/clustpy/deep/den.py index 7306175..80336b0 100644 --- a/clustpy/deep/den.py +++ b/clustpy/deep/den.py @@ -229,7 +229,7 @@ def _loss(self, batch: list, group_size: list, neural_network: torch.nn.Module, # Calculate group sparsity constraint group_sparsity_loss = self._group_sparsity_loss(embedded, group_size) loss = ssl_loss + self.weight_locality_constraint * locality_preserving_loss + self.weight_sparsity_constraint * group_sparsity_loss - return loss + return loss, ssl_loss, locality_preserving_loss, group_sparsity_loss def _get_nearest_neighbors(self, X: np.ndarray) -> list: @@ -294,15 +294,25 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEN': for _ in tbar: # Update Network total_loss = 0 + total_ssl_loss = 0 + total_locality_loss = 0 + total_sparsity_loss = 0 for batch 
in trainloader: loss = self._loss(batch, group_size, neural_network, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() + total_locality_loss += loss[2].item() + total_sparsity_loss += loss[3].item() # Backward pass - update weights optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) + self._log_history("Total Loss", total_loss) + self._log_history("SSL Loss", total_ssl_loss) + self._log_history("Locality Loss", total_locality_loss) + self._log_history("Sparsity Loss", total_sparsity_loss) # Execute clustering with Kmeans embedded_data = encode_batchwise(testloader, neural_network) kmeans = KMeans(n_clusters=self.n_clusters, random_state=random_state) @@ -312,4 +322,5 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEN': self.cluster_centers_ = kmeans.cluster_centers_ self.neural_network_trained_ = neural_network self.set_n_featrues_in(X) + return self diff --git a/clustpy/deep/dipdeck.py b/clustpy/deep/dipdeck.py index 8433330..7d9065b 100644 --- a/clustpy/deep/dipdeck.py +++ b/clustpy/deep/dipdeck.py @@ -469,7 +469,7 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, ssl_loss_fn: Calla loss = ssl_loss_weight * ssl_loss + clustering_loss_weight * cluster_loss return loss - def fit(self, X, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, + def fit(self, X, val_set: np.ndarray | None, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, testloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, optimizer: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, clustering_loss_weight: float, ssl_loss_weight: float, debug: bool) -> '_DipDECK_Module': diff --git a/clustpy/deep/dipencoder.py b/clustpy/deep/dipencoder.py index bf6774e..4d82748 100644 --- a/clustpy/deep/dipencoder.py +++ b/clustpy/deep/dipencoder.py @@ 
-785,7 +785,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = None, pretrain_optimiz self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DipEncoder': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'DipEncoder': """ Initiate the actual clustering/dimensionality reduction process on the input data set. If no ground truth labels are given, the resulting cluster labels will be stored in the labels_ attribute. @@ -794,6 +794,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DipEncoder': ---------- X : np.ndarray The given (training) data set + val_set : np.ndarray + The validation data set (not used in DipEncoder, included for compatibility reasons) (default: None) y : np.ndarray The ground truth labels. If None, the DipEncoder will be used for clustering (default: None) @@ -807,7 +809,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DipEncoder': batch_size = 25 * self.n_clusters if self.batch_size is None else self.batch_size # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, X_embed, n_clusters, init_labels, init_centers, _ = get_default_deep_clustering_initialization( - X, self.n_clusters, batch_size, pretrain_optimizer_params, self.pretrain_epochs, self.optimizer_class, self.ssl_loss_fn, + X, val_set, self.n_clusters, batch_size, pretrain_optimizer_params, self.pretrain_epochs, self.optimizer_class, self.ssl_loss_fn, self.neural_network, self.embedding_size, self.custom_dataloaders, self.initial_clustering_class if y is None else None, initial_clustering_params, self.device, random_state, neural_network_weights=self.neural_network_weights) if y is not None: diff --git a/clustpy/deep/dkm.py b/clustpy/deep/dkm.py index 896a6f3..65426ad 100644 --- a/clustpy/deep/dkm.py +++ 
b/clustpy/deep/dkm.py @@ -14,13 +14,13 @@ from collections.abc import Callable -def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, pretrain_optimizer_params: dict, +def _dkm(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, alphas: list | tuple, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, - initial_clustering_params: dict, device: torch.device, random_state: np.random.RandomState) -> ( + initial_clustering_params: dict, device: torch.device, random_state: np.random.RandomState,log_fn: Callable | None) -> ( np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual DKM clustering procedure on the input data set. @@ -29,6 +29,8 @@ def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, ---------- X : np.ndarray / torch.Tensor the given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray + validation set (can be ignored) n_clusters : int number of clusters. Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. 
DBSCAN alphas : list | tuple @@ -88,11 +90,11 @@ def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, _, init_centers, _ = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup DKM Module - dkm_module = _DKM_Module(init_centers, alphas, augmentation_invariance).to(device) + dkm_module = _DKM_Module(init_centers, alphas, augmentation_invariance,log_fn).to(device) # Use DKM optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()) + list(dkm_module.parameters()), **clustering_optimizer_params) @@ -183,10 +185,12 @@ class _DKM_Module(torch.nn.Module): Is augmentation invariance used """ - def __init__(self, init_centers: np.ndarray, alphas: list, augmentation_invariance: bool = False): + def __init__(self, init_centers: np.ndarray, alphas: list, + augmentation_invariance: bool = False, log_fn: Callable | None = None): super().__init__() self.alphas = alphas self.augmentation_invariance = augmentation_invariance + self.log_fn = log_fn # Centers are learnable parameters self.centers = torch.nn.Parameter(torch.tensor(init_centers), requires_grad=True) @@ -320,7 +324,7 @@ def _loss(self, batch: list, alpha: float, neural_network: torch.nn.Module, clus # Calculate clustering loss cluster_loss = self.dkm_loss(embedded, alpha) loss = ssl_loss_weight * 
ssl_loss + cluster_loss * clustering_loss_weight - return loss + return loss, ssl_loss, cluster_loss def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, optimizer: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, @@ -357,17 +361,27 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat for alpha in self.alphas: for _ in range(n_epochs): total_loss = 0 + total_ssl_loss = 0 + total_cluster_loss = 0 for batch in trainloader: loss = self._loss(batch, alpha, neural_network, clustering_loss_weight, ssl_loss_weight, ssl_loss_fn, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() + total_cluster_loss += loss[2].item() # Backward pass optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss, "Alpha": alpha} tbar.set_postfix(postfix_str) tbar.update() + if self.log_fn is not None: + self.log_fn("Alpha completed", alpha) + self.log_fn("Total Loss", total_loss) + self.log_fn("SSL Loss", total_ssl_loss) + self.log_fn("Clustering Loss", total_cluster_loss) + return self @@ -506,7 +520,7 @@ def _check_alphas(self) -> list: assert type(alphas) is tuple or type(alphas) is list, "alphas must be a list, int or tuple" return alphas - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DKM': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'DKM': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -515,6 +529,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DKM': ---------- X : np.ndarray the given data set + val_set : np.ndarray + validation set (can be ignored) y : np.ndarray the labels (can be ignored) @@ -525,7 +541,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DKM': """ X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, initial_clustering_params = self._check_parameters(X, y=y) alphas = self._check_alphas() - kmeans_labels, kmeans_centers, dkm_labels, dkm_centers, neural_network = _dkm(X, self.n_clusters, alphas, + kmeans_labels, kmeans_centers, dkm_labels, dkm_centers, neural_network = _dkm(X, val_set, self.n_clusters, alphas, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, @@ -543,7 +559,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DKM': self.initial_clustering_class, initial_clustering_params, self.device, - random_state) + random_state, + self._log_history) self.labels_ = kmeans_labels self.cluster_centers_ = kmeans_centers self.dkm_labels_ = dkm_labels diff --git a/clustpy/deep/neural_networks/_abstract_autoencoder.py b/clustpy/deep/neural_networks/_abstract_autoencoder.py index f582157..86e661f 100644 --- a/clustpy/deep/neural_networks/_abstract_autoencoder.py +++ b/clustpy/deep/neural_networks/_abstract_autoencoder.py @@ -118,6 +118,8 @@ def __init__(self, work_on_copy: bool = True, random_state: np.random.RandomStat self.random_state = random_state self.fitted = False self.allow_nd_input = False + rs = check_random_state(self.random_state) + set_torch_seed(rs) def encode(self, x: torch.Tensor) -> torch.Tensor: """ @@ -266,7 +269,7 @@ def evaluate(self, dataloader: torch.utils.data.DataLoader, ssl_loss_fn: Callabl """ with torch.no_grad(): self.eval() - loss = torch.tensor(0.) 
+ loss = torch.tensor(0.0,device=device) for batch in dataloader: new_loss, _, _ = self.loss(batch, ssl_loss_fn, device) loss += new_loss @@ -279,7 +282,7 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, patience: int = 5, scheduler: torch.optim.lr_scheduler = None, scheduler_params: dict = {}, - corruption_fn: Callable = None, model_path: str = None) -> '_AbstractAutoencoder': + corruption_fn: Callable = None, model_path: str = None,log_fn: Callable[[str, float], None] = None) -> '_AbstractAutoencoder': """ Trains the autoencoder in place. @@ -316,7 +319,8 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in For example, if the data is normalized, this may have to be taken into account in the corruption function - e.g. in case of salt and pepper noise (default: None) model_path : str if specified will save the trained model to the location. If evalloader is used, then only the best model w.r.t. evaluation loss is saved (default: None) - + log_fn : Callable[[str, float], None] + function that takes a string and a float as input and logs the training process (default: None) Returns ------- self : _AbstractAutoencoder @@ -355,6 +359,7 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in # training loop device = get_device_from_module(self) tbar = tqdm.trange(n_epochs, desc="AE training") + for epoch_i in tbar: self.train() total_loss = 0 @@ -382,8 +387,16 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in self.save_parameters(model_path) if early_stopping.early_stop: print(f"Stop training at epoch {best_epoch}. 
Best Loss: {best_loss:.6f}, Last Loss: {val_loss:.6f}") + break if scheduler is not None and eval_step_scheduler: scheduler.step(val_loss) + if log_fn is not None: + if evalloader is not None: + log_fn("pretrain/Eval Loss", val_loss.item()) + log_fn("pretrain/Train Loss", total_loss) + tbar.set_postfix(postfix_str) # change to eval mode after training self.eval() diff --git a/clustpy/deep/vade.py b/clustpy/deep/vade.py index b5d6e79..06ccc23 100644 --- a/clustpy/deep/vade.py +++ b/clustpy/deep/vade.py @@ -17,13 +17,14 @@ from collections.abc import Callable -def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, +def _vade(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, - device: torch.device, random_state: np.random.RandomState) -> ( + device: torch.device, random_state: np.random.RandomState, + log_fn: Callable | None) -> ( np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual VaDE clustering procedure on the input data set. @@ -32,6 +33,8 @@ def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_pa ---------- X : np.ndarray / torch.Tensor the given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray / torch.Tensor | None + validation set (can be ignored) n_clusters : int number of clusters. 
Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN batch_size : int @@ -86,15 +89,15 @@ def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_pa """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_means, init_clustering_algo = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, _VaDE_VAE, neural_network_weights=neural_network_weights) + random_state, _VaDE_VAE, log_fn=log_fn, neural_network_weights=neural_network_weights) # Get parameters from initial clustering algorithm init_weights = None if not hasattr(init_clustering_algo, "weights_") else init_clustering_algo.weights_ init_covs = None if not hasattr(init_clustering_algo, "covariances_") else init_clustering_algo.covariances_ # Initialize VaDE vade_module = _VaDE_Module(n_clusters=n_clusters, embedding_size=embedding_size, weights=init_weights, - means=init_means, variances=init_covs).to(device) + means=init_means, variances=init_covs,log_fn=log_fn).to(device) # Use vade learning_rate (usually pretrain_optimizer_params reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()) + list(vade_module.parameters()), **clustering_optimizer_params) @@ -238,7 +241,7 @@ class _VaDE_Module(torch.nn.Module): """ def __init__(self, n_clusters: int, embedding_size: int, weights: torch.Tensor = None, means: torch.Tensor = None, - variances: torch.Tensor = None): + variances: torch.Tensor = None, log_fn: Callable | None = None): super(_VaDE_Module, 
self).__init__() if weights is None: # if not initialized then use uniform distribution @@ -254,7 +257,7 @@ def __init__(self, n_clusters: int, embedding_size: int, weights: torch.Tensor = embedding_size), "Shape of the initial variances for the Vade_Module must be (n_clusters, embedding_size)" self.p_log_var = torch.nn.Parameter(torch.log(torch.tensor(variances)), requires_grad=True) self.normalize_prob = torch.nn.Softmax(dim=0) - + self.log_fn = log_fn def predict(self, q_mean: torch.Tensor, q_logvar: torch.Tensor) -> torch.Tensor: """ Predict the labels given the specific means and variances of given samples. @@ -305,9 +308,9 @@ def vade_loss(self, neural_network: VariationalAutoencoder, batch_data: torch.Te z, q_mean, q_logvar, reconstruction = neural_network.forward(batch_data) pi_normalized = self.normalize_prob(self.pi) p_c_z = _get_gamma(pi_normalized, self.p_mean, self.p_log_var, z) - loss = _compute_vade_loss(pi_normalized, self.p_mean, self.p_log_var, q_mean, q_logvar, batch_data, p_c_z, + loss, ssl_loss = _compute_vade_loss(pi_normalized, self.p_mean, self.p_log_var, q_mean, q_logvar, batch_data, p_c_z, reconstruction, ssl_loss_fn, clustering_loss_weight, ssl_loss_weight) - return loss + return loss, ssl_loss def fit(self, neural_network: VariationalAutoencoder, testloader: torch.utils.data.DataLoader, trainloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, @@ -348,16 +351,21 @@ def fit(self, neural_network: VariationalAutoencoder, testloader: torch.utils.da for _ in tbar: self.train() total_loss = 0 + total_ssl_loss = 0 for batch in trainloader: # load batch on device batch_data = batch[1].to(device) loss = self.vade_loss(neural_network, batch_data, ssl_loss_fn, clustering_loss_weight, ssl_loss_weight) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss} + if self.log_fn is not 
None: + self.log_fn("Total Loss", total_loss) + self.log_fn("SSL Loss", total_ssl_loss) tbar.set_postfix(postfix_str) return self @@ -478,7 +486,7 @@ def _compute_vade_loss(pi: torch.Tensor, p_mean: torch.Tensor, p_log_var: torch. loss = p_z_c - p_c - q_z_x + q_c_x loss /= batch_data.size(0) loss = clustering_loss_weight * loss + ssl_loss_weight * p_x_z # Beware that we do not divide two times by number of samples - return loss + return loss, p_x_z class VaDE(_AbstractDeepClusteringAlgo): @@ -589,7 +597,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'VaDE': + def fit(self, X: np.ndarray, y: np.ndarray = None, val_set: np.ndarray = None) -> 'VaDE': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. @@ -598,6 +606,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'VaDE': ---------- X : np.ndarray the given data set y : np.ndarray the labels (can be ignored) + val_set : np.ndarray + optional validation set forwarded to the training procedure (can be None) @@ -613,6 +623,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'VaDE': "covariance_type": "diag"} if self.initial_clustering_params is None else self.initial_clustering_params gmm_labels, gmm_means, gmm_covariances, gmm_weights, vade_labels, vade_centers, vade_covariances, neural_network = _vade( X, + val_set, self.n_clusters, self.batch_size, pretrain_optimizer_params, @@ -630,7 +641,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'VaDE': self.initial_clustering_class, initial_clustering_params, self.device, - random_state) + random_state, + self._log_history) self.labels_ = gmm_labels self.cluster_centers_ = gmm_means self.covariances_ = gmm_covariances