Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.aider*
2 changes: 1 addition & 1 deletion src/graphcompass/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Graph-COMPASS."""
"""GraphCompass"""
from graphcompass import pl
from graphcompass import tl
from graphcompass import datasets
Expand Down
2 changes: 1 addition & 1 deletion src/graphcompass/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
@click.command()
@click.version_option()
def main() -> None:
    # The docstring doubles as the click CLI help text.
    """GraphCompass"""


if __name__ == "__main__":
Expand Down
18 changes: 12 additions & 6 deletions src/graphcompass/imports/wwl_package/propagation_scheme.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
######## This file is copied from https://github.com/BorgwardtLab/WWL/blob/master/src/wwl/propagation_scheme.py ########
######## This file is adapted from https://github.com/BorgwardtLab/WWL/blob/master/src/wwl/propagation_scheme.py ########

# -----------------------------------------------------------------------------
# This file contains the propagation schemes for categorically labeled and
Expand All @@ -19,7 +19,7 @@
from collections import defaultdict
from typing import List
from tqdm import tqdm
from scipy.sparse import csr_matrix, diags
from scipy.sparse import csr_matrix, diags, eye


####################
Expand Down Expand Up @@ -149,8 +149,14 @@ def _preprocess_graphs(self, X: List[ig.Graph]):

# Iterate across graphs and load initial node features
for graph in X:
if not 'label' in graph.vs.attribute_names():
graph.vs['label'] = list(map(str, [l for l in graph.vs.degree()]))
if 'label' in graph.vs.attribute_names():
labels = graph.vs['label']
if not all(isinstance(label, (int, float)) for label in labels):
logging.warning("Non-numeric labels found. Falling back to node degrees.")
graph.vs['label'] = list(graph.vs.degree())
else:
graph.vs['label'] = list(graph.vs.degree())

# Get features and adjacency matrix
node_features_cur = np.asarray(graph.vs['label']).astype(float).reshape(-1, 1)
adj_mat_cur = csr_matrix(graph.get_adjacency_sparse())
Expand Down Expand Up @@ -212,12 +218,12 @@ def fit_transform(self, X: List[ig.Graph], node_features = None, num_iterations:
if it == 0:
graph_feat.append(node_features[i])
else:
adj_cur = adj_mat[i] + csr_matrix(np.identity(adj_mat[i].shape[0]))
adj_cur = adj_mat[i] + eye(adj_mat[i].shape[0], format='csr')
adj_cur = self._create_adj_avg(adj_cur)

adj_cur.setdiag(0)
graph_feat_cur = 0.5 * (adj_cur @ graph_feat[it-1] + graph_feat[it-1])
graph_feat.append(graph_feat_cur)

self._label_sequences.append(np.concatenate(graph_feat, axis=1))
return self._label_sequences
return self._label_sequences
182 changes: 109 additions & 73 deletions src/graphcompass/imports/wwl_package/wwl.py
Original file line number Diff line number Diff line change
@@ -1,90 +1,121 @@
######## This file is copied from https://github.com/BorgwardtLab/WWL/blob/master/src/wwl/wwl.py ########
"""
Wasserstein Weisfeiler-Lehman (WWL) kernel implementation.

This module provides tools for computing graph similarities using the Wasserstein
Weisfeiler-Lehman kernel, supporting both categorical and continuous graph embeddings.

Adapted from: https://github.com/BorgwardtLab/WWL/blob/master/src/wwl/wwl.py
"""

# -----------------------------------------------------------------------------
# This file contains the API for the WWL kernel computations
#
# December 2019, M. Togninalli
# -----------------------------------------------------------------------------
import sys
import logging
from tqdm import tqdm

import ot
import numpy as np
import torch
from geomloss import SamplesLoss
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import laplacian_kernel

from .propagation_scheme import WeisfeilerLehman, ContinuousWeisfeilerLehman

logging.basicConfig(level=logging.INFO)

def logging_config(level='DEBUG'):
    """Set the logging level for the application.

    Args:
        level (str, optional): Logging level name, case-insensitive.
            Typical values: 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.
            Defaults to 'DEBUG'.
    """
    numeric_level = logging.getLevelName(level.upper())
    logging.basicConfig(level=numeric_level)
    # basicConfig is a no-op once the root logger has handlers (it is already
    # called at module import time), so set the level explicitly as well to
    # make later calls effective.
    logging.getLogger().setLevel(numeric_level)

def _compute_wasserstein_distance(label_sequences, sinkhorn=False,
                                  categorical=False, sinkhorn_lambda=1e-2):
    """Generate the Wasserstein distance matrix for the graphs embedded
    in ``label_sequences``.

    Args:
        label_sequences (list): Per-graph node embeddings, each an array
            of shape (n_nodes, n_features).
        sinkhorn (bool): If True, use the entropy-regularized Sinkhorn
            approximation instead of exact EMD.
        categorical (bool): If True, use the Hamming ground distance
            (categorical labels); otherwise Euclidean (continuous).
        sinkhorn_lambda (float): Entropy regularization strength for the
            Sinkhorn solver.

    Returns:
        numpy.ndarray: Symmetric (n_graphs, n_graphs) distance matrix.
    """
    n = len(label_sequences)
    M = np.zeros((n, n))
    # The ground metric is loop-invariant; choose it once.
    ground_distance = 'hamming' if categorical else 'euclidean'

    # Only the upper triangle (j >= i) is computed; symmetry fills the rest.
    for i, labels_1 in enumerate(label_sequences):
        for j, labels_2 in tqdm(enumerate(label_sequences[i:], start=i)):
            # Pairwise ground-distance cost matrix between node embeddings.
            costs = ot.dist(labels_1, labels_2, metric=ground_distance)

            if sinkhorn:
                # Uniform marginals over the nodes of each graph.
                mat = ot.sinkhorn(
                    np.ones(len(labels_1)) / len(labels_1),
                    np.ones(len(labels_2)) / len(labels_2),
                    costs,
                    sinkhorn_lambda,
                    numItermax=50,
                )
                M[i, j] = np.sum(np.multiply(mat, costs))
            else:
                # Empty weight lists mean uniform marginals in POT.
                M[i, j] = ot.emd2([], [], costs)

    # Mirror the computed triangle; diagonal entries are (approximately)
    # zero self-distances, matching the original doubling behavior.
    M = (M + M.T)
    return M

def pairwise_wasserstein_distance(X, node_features = None, num_iterations=3, sinkhorn=False, enforce_continuous=False):
"""Set the logging level for the application.

Configures the global logging level to control the verbosity of log messages.

Args:
level (str, optional): Logging level. Defaults to 'DEBUG'.
Typical values include 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.
"""
Pairwise computation of the Wasserstein distance between embeddings of the
graphs in X.
args:
X (List[ig.graphs]): List of graphs
node_features (array): Array containing the node features for continuously attributed graphs
num_iterations (int): Number of iterations for the propagation scheme
sinkhorn (bool): Indicates whether sinkhorn approximation should be used
logging.basicConfig(level=logging.getLevelName(level.upper()))

def _compute_wasserstein_distance_geomloss(label_sequences, categorical=False, blur=0.05, p=2):
    """Compute pairwise Wasserstein distances between graph node embeddings.

    Uses the GeomLoss Sinkhorn solver, running on GPU when CUDA is available
    and on CPU otherwise. Categorical label sequences are one-hot encoded
    (with a single encoder fitted over all graphs) before transport; continuous
    sequences are used as-is.

    Args:
        label_sequences (list): Node embeddings for each graph.
        categorical (bool): Whether the node labels are discrete categories
            rather than continuous feature vectors.
        blur (float, optional): Sinkhorn smoothing parameter. Defaults to 0.05.
        p (int, optional): Power of the cost function. Defaults to 2.

    Returns:
        numpy.ndarray: Symmetric matrix of pairwise Wasserstein distances.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss_fn = SamplesLoss("sinkhorn", p=p, blur=blur)

    num_graphs = len(label_sequences)
    distances = torch.zeros((num_graphs, num_graphs), device=device)

    if categorical:
        # Fit one encoder on the union of all labels so every graph shares
        # the same one-hot dimensionality, then encode each graph up front.
        encoder = OneHotEncoder(sparse_output=False, dtype=np.float32)
        encoder.fit(np.concatenate(label_sequences).reshape(-1, 1))
        embeddings = [
            torch.tensor(encoder.transform(labels.reshape(-1, 1)), device=device)
            for labels in label_sequences
        ]
    else:
        # Sequences are assumed to already be (n_nodes, features) arrays.
        embeddings = [
            torch.tensor(labels, dtype=torch.float32, device=device)
            for labels in label_sequences
        ]

    for i, emb_i in enumerate(embeddings):
        # Uniform weight over the nodes of graph i.
        weights_i = torch.ones(emb_i.shape[0], device=device) / emb_i.shape[0]
        for j in range(i, num_graphs):
            emb_j = embeddings[j]
            weights_j = torch.ones(emb_j.shape[0], device=device) / emb_j.shape[0]

            dist = loss_fn(weights_i, emb_i, weights_j, emb_j)
            # Fill both triangles at once — the matrix is symmetric.
            distances[i, j] = dist
            distances[j, i] = dist

    return distances.cpu().numpy()

def pairwise_wasserstein_distance(X, node_features=None, num_iterations=3, enforce_continuous=False):
"""Compute pairwise Wasserstein distances between graph embeddings.

Determines the appropriate embedding scheme (categorical or continuous)
and computes the Wasserstein distances between graph representations.

Args:
X (list): List of graphs to compare
node_features (array-like, optional): Pre-computed node features for continuous graphs
num_iterations (int, optional): Number of iterations for graph embedding. Defaults to 3.
enforce_continuous (bool, optional): Force use of continuous embedding scheme. Defaults to False.

Returns:
numpy.ndarray: Matrix of pairwise Wasserstein distances between graphs
"""
# First check if the graphs are continuous vs categorical
categorical = True
if enforce_continuous:
logging.info('Enforce continous flag is on, using CONTINUOUS propagation scheme.')
logging.info('Continuous embedding enforced: Using continuous propagation scheme.')
categorical = False
elif node_features is not None:
logging.info('Continuous node features provided, using CONTINUOUS propagation scheme.')
logging.info('Continuous node features detected: Using continuous propagation scheme.')
categorical = False
else:
for g in X:
if not 'label' in g.vs.attribute_names():
logging.info('No label attributed to graphs, use degree instead and use CONTINUOUS propagation scheme.')
if 'label' not in g.vs.attribute_names() or not all(isinstance(label, (int, float)) for label in g.vs['label']):
logging.info('Invalid categorical labels found: Switching to continuous propagation scheme using node degrees.')
categorical = False
break
if categorical:
logging.info('Categorically-labelled graphs, using CATEGORICAL propagation scheme.')
logging.info('Valid categorical graph labels detected: Using categorical propagation scheme.')

# Embed the nodes
if categorical:
Expand All @@ -95,21 +126,26 @@ def pairwise_wasserstein_distance(X, node_features = None, num_iterations=3, sin
node_representations = es.fit_transform(X, node_features=node_features, num_iterations=num_iterations)

# Compute the Wasserstein distance
print("Computing Wasserstein distance between conditions...")
pairwise_distances = _compute_wasserstein_distance(node_representations, sinkhorn=sinkhorn,
categorical=categorical, sinkhorn_lambda=1e-2)
logging.info("Computing pairwise Wasserstein distances between graph embeddings...")
pairwise_distances = _compute_wasserstein_distance_geomloss(node_representations, categorical=categorical)
return pairwise_distances

def wwl(X, node_features=None, num_iterations=3, gamma=None):
    """Compute the Wasserstein Weisfeiler-Lehman (WWL) kernel for a set of graphs.

    Combines Wasserstein distance computation with a Laplacian kernel to
    measure graph similarities.

    Args:
        X (list): List of graphs to compare
        node_features (array-like, optional): Pre-computed node features for continuous graphs
        num_iterations (int, optional): Number of iterations for graph embedding. Defaults to 3.
        gamma (float, optional): Scaling parameter for the Laplacian kernel. Defaults to None.

    Returns:
        numpy.ndarray: Kernel matrix representing graph similarities
    """
    # Pairwise Wasserstein distances between the graph embeddings, then
    # a Laplacian kernel turns distances into similarities.
    D_W = pairwise_wasserstein_distance(X, node_features=node_features,
                                        num_iterations=num_iterations)
    return laplacian_kernel(D_W, gamma=gamma)


#######################
# Class implementation
#######################
Loading