From 7b60bb8c463abdb8dc7c2e6290c3fab71cb4127a Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 28 Jan 2021 09:40:47 -0800 Subject: [PATCH 1/6] Add cosine distance as valid metric Allows cosine distance to be set as the metric. [scipy.spatial.distance.cosine](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html#scipy.spatial.distance.cosine) returns `1 - cosine similarity`, which is [equivalent to angular distance](https://en.wikipedia.org/wiki/Cosine_similarity#Angular_distance_and_similarity), and thus is the same thing as setting `angular` as the metric. ``` ValueError: Unknown metric angular. Valid metrics are ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', 'wminkowski', 'nan_euclidean', 'haversine'], or 'precomputed', or a callable ```
Full error message ``` --------------------------------------------------------------------------- ValueError Traceback (most recent call last) in 1 sc.external.pp.bbknn(preprocessed, batch_key='species_batch', n_pcs=15, metric='cosine') 2 ----> 3 sc.tl.umap(preprocessed) 4 sc.pl.umap(preprocessed, **umap_plot_kws) ~/miniconda3/envs/tabula-microcebus-jan2021/lib/python3.7/site-packages/scanpy/tools/_umap.py in umap(adata, min_dist, spread, n_components, maxiter, alpha, gamma, negative_sample_rate, init_pos, random_state, a, b, copy, method, neighbors_key) 171 neigh_params.get('metric', 'euclidean'), 172 neigh_params.get('metric_kwds', {}), --> 173 verbose=settings.verbosity > 3, 174 ) 175 elif method == 'rapids': ~/miniconda3/envs/tabula-microcebus-jan2021/lib/python3.7/site-packages/umap/umap_.py in simplicial_set_embedding(data, graph, n_components, initial_alpha, a, b, gamma, negative_sample_rate, n_epochs, init, random_state, metric, metric_kwds, output_metric, output_metric_kwds, euclidean_output, parallel, verbose) 1037 random_state, 1038 metric=metric, -> 1039 metric_kwds=metric_kwds, 1040 ) 1041 expansion = 10.0 / np.abs(initialisation).max() ~/miniconda3/envs/tabula-microcebus-jan2021/lib/python3.7/site-packages/umap/spectral.py in spectral_layout(data, graph, dim, random_state, metric, metric_kwds) 304 random_state, 305 metric=metric, --> 306 metric_kwds=metric_kwds, 307 ) 308 ~/miniconda3/envs/tabula-microcebus-jan2021/lib/python3.7/site-packages/umap/spectral.py in multi_component_layout(data, graph, n_components, component_labels, dim, random_state, metric, metric_kwds) 191 random_state, 192 metric=metric, --> 193 metric_kwds=metric_kwds, 194 ) 195 else: ~/miniconda3/envs/tabula-microcebus-jan2021/lib/python3.7/site-packages/umap/spectral.py in component_layout(data, n_components, component_labels, dim, random_state, metric, metric_kwds) 120 else: 121 distance_matrix = pairwise_distances( --> 122 component_centroids, metric=metric, **metric_kwds 123 ) 
124 ~/miniconda3/envs/tabula-microcebus-jan2021/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs) 70 FutureWarning) 71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) ---> 72 return f(**kwargs) 73 return inner_f 74 ~/miniconda3/envs/tabula-microcebus-jan2021/lib/python3.7/site-packages/sklearn/metrics/pairwise.py in pairwise_distances(X, Y, metric, n_jobs, force_all_finite, **kwds) 1738 raise ValueError("Unknown metric %s. " 1739 "Valid metrics are %s, or 'precomputed', or a " -> 1740 "callable" % (metric, _VALID_METRICS)) 1741 1742 if metric == "precomputed": ValueError: Unknown metric angular. Valid metrics are ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', 'wminkowski', 'nan_euclidean', 'haversine'], or 'precomputed', or a callable ```
--- bbknn/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bbknn/__init__.py b/bbknn/__init__.py index ed48829..83f90ce 100644 --- a/bbknn/__init__.py +++ b/bbknn/__init__.py @@ -280,9 +280,9 @@ def bbknn(adata, batch_key='batch', use_rep='X_pca', approx=True, metric='angula if use_rep not in adata.obsm.keys(): raise ValueError("Did not find "+use_rep+" in `.obsm.keys()`. You need to compute it first.") #metric sanity checks - if approx and metric not in ['angular', 'euclidean', 'manhattan', 'hamming']: - logg.warning('unrecognised metric for type of neighbor calculation, switching to angular') - metric = 'angular' + if approx and metric not in ['angular', 'euclidean', 'manhattan', 'hamming', 'cosine']: + logg.warning('unrecognised metric for type of neighbor calculation, switching to cosine (') + metric = 'cosine' elif not approx and not (metric=='euclidean' or isinstance(metric,DistanceMetric) or metric in KDTree.valid_metrics): logg.warning('unrecognised metric for type of neighbor calculation, switching to euclidean') metric = 'euclidean' From 7890bcc52261d56837c89e16c17f197583762141 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 28 Jan 2021 09:53:27 -0800 Subject: [PATCH 2/6] Change default metric to "cosine" --- bbknn/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bbknn/__init__.py b/bbknn/__init__.py index 83f90ce..ddbcd9f 100644 --- a/bbknn/__init__.py +++ b/bbknn/__init__.py @@ -211,7 +211,7 @@ def trimming(cnts,trim): cnts = cnts.T.tocsr() return cnts -def bbknn(adata, batch_key='batch', use_rep='X_pca', approx=True, metric='angular', copy=False, **kwargs): +def bbknn(adata, batch_key='batch', use_rep='X_pca', approx=True, metric='cosine', copy=False, **kwargs): ''' Batch balanced KNN, altering the KNN procedure to identify each cell's top neighbours in each batch separately instead of the entire cell pool with no accounting for batch. 
@@ -248,8 +248,8 @@ def bbknn(adata, batch_key='batch', use_rep='X_pca', approx=True, metric='angula If ``approx=False`` and the metric is "euclidean", use the faiss package to compute nearest neighbours if installed. This improves performance at a minor cost to numerical precision as faiss operates on float32. - metric : ``str`` or ``sklearn.neighbors.DistanceMetric``, optional (default: "angular") - What distance metric to use. If using ``approx=True``, the options are "angular", + metric : ``str`` or ``sklearn.neighbors.DistanceMetric``, optional (default: "cosine") + What distance metric to use. If using ``approx=True``, the options are "cosine", "euclidean", "manhattan" and "hamming". Otherwise, the options are "euclidean", a member of the ``sklearn.neighbors.KDTree.valid_metrics`` list, or parameterised ``sklearn.neighbors.DistanceMetric`` `objects @@ -280,7 +280,7 @@ def bbknn(adata, batch_key='batch', use_rep='X_pca', approx=True, metric='angula if use_rep not in adata.obsm.keys(): raise ValueError("Did not find "+use_rep+" in `.obsm.keys()`. 
You need to compute it first.") #metric sanity checks - if approx and metric not in ['angular', 'euclidean', 'manhattan', 'hamming', 'cosine']: + if approx and metric not in ['euclidean', 'manhattan', 'hamming', 'cosine']: logg.warning('unrecognised metric for type of neighbor calculation, switching to cosine (') metric = 'cosine' elif not approx and not (metric=='euclidean' or isinstance(metric,DistanceMetric) or metric in KDTree.valid_metrics): @@ -317,7 +317,7 @@ def bbknn(adata, batch_key='batch', use_rep='X_pca', approx=True, metric='angula return adata if copy else None def bbknn_pca_matrix(pca, batch_list, neighbors_within_batch=3, n_pcs=50, trim=None, - approx=True, n_trees=10, use_faiss=True, metric='angular', + approx=True, n_trees=10, use_faiss=True, metric='cosine', set_op_mix_ratio=1, local_connectivity=1): ''' Scanpy-independent BBKNN variant that runs on a PCA matrix and list of per-cell batch assignments instead of From 2c00555f8a60c64b7df39f64c4c0b03dadbec659 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 29 Jan 2021 20:07:11 -0800 Subject: [PATCH 3/6] Update metric within bbknn_pca_matrix --- bbknn/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/bbknn/__init__.py b/bbknn/__init__.py index ddbcd9f..9361dfa 100644 --- a/bbknn/__init__.py +++ b/bbknn/__init__.py @@ -71,7 +71,8 @@ def compute_connectivities_umap(knn_indices, knn_dists, if isinstance(connectivities, tuple): # In umap-learn 0.4, this returns (result, sigmas, rhos) connectivities = connectivities[0] - distances = get_sparse_matrix_from_indices_distances_umap(knn_indices, knn_dists, n_obs, n_neighbors) + distances = get_sparse_matrix_from_indices_distances_umap(knn_indices, knn_dists, n_obsunrecognised metric for type of neighbor calculation, switching to angular +, n_neighbors) return distances, connectivities.tocsr() @@ -343,9 +344,9 @@ def bbknn_pca_matrix(pca, batch_list, neighbors_within_batch=3, n_pcs=50, trim=N if np.min(counts) < 
neighbors_within_batch: raise ValueError("Not all batches have at least `neighbors_within_batch` cells in them.") #metric sanity checks (duplicating the ones in bbknn(), but without scanpy logging) - if approx and metric not in ['angular', 'euclidean', 'manhattan', 'hamming']: - print('unrecognised metric for type of neighbor calculation, switching to angular') - metric = 'angular' + if approx and metric not in ['cosine', 'euclidean', 'manhattan', 'hamming']: + print('unrecognised metric for type of neighbor calculation, switching to cosine') + metric = 'cosine' elif not approx and not (metric=='euclidean' or isinstance(metric,DistanceMetric) or metric in KDTree.valid_metrics): print('unrecognised metric for type of neighbor calculation, switching to euclidean') metric = 'euclidean' From 73a487a68fc536f018aec65b6b23a78e0024ec55 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 29 Jan 2021 20:15:33 -0800 Subject: [PATCH 4/6] Update bbknn/__init__.py --- bbknn/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bbknn/__init__.py b/bbknn/__init__.py index 9361dfa..548f158 100644 --- a/bbknn/__init__.py +++ b/bbknn/__init__.py @@ -71,7 +71,7 @@ def compute_connectivities_umap(knn_indices, knn_dists, if isinstance(connectivities, tuple): # In umap-learn 0.4, this returns (result, sigmas, rhos) connectivities = connectivities[0] - distances = get_sparse_matrix_from_indices_distances_umap(knn_indices, knn_dists, n_obsunrecognised metric for type of neighbor calculation, switching to angular + distances = get_sparse_matrix_from_indices_distances_umap(knn_indices, knn_dists, n_obs, n_neighbors) , n_neighbors) return distances, connectivities.tocsr() From 810701024b5a6f59b70604edad22c3772d1f52e7 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 29 Jan 2021 20:15:56 -0800 Subject: [PATCH 5/6] Update bbknn/__init__.py --- bbknn/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bbknn/__init__.py b/bbknn/__init__.py index 
548f158..290c04b 100644 --- a/bbknn/__init__.py +++ b/bbknn/__init__.py @@ -72,7 +72,6 @@ def compute_connectivities_umap(knn_indices, knn_dists, # In umap-learn 0.4, this returns (result, sigmas, rhos) connectivities = connectivities[0] distances = get_sparse_matrix_from_indices_distances_umap(knn_indices, knn_dists, n_obs, n_neighbors) -, n_neighbors) return distances, connectivities.tocsr() From f54b4b8b48ea8438d7741d7316f0642e04165ba1 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 29 Jan 2021 20:19:44 -0800 Subject: [PATCH 6/6] set metric to "angular" for AnnoyIndex internally --- bbknn/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bbknn/__init__.py b/bbknn/__init__.py index 290c04b..75002eb 100644 --- a/bbknn/__init__.py +++ b/bbknn/__init__.py @@ -86,7 +86,11 @@ def create_tree(data,approx,metric,use_faiss,n_trees): PCA coordinates of a batch's cells to index. ''' if approx: - ckd = AnnoyIndex(data.shape[1],metric=metric) + if metric == 'cosine': + annoy_metric = 'angular' + else: + annoy_metric = metric + ckd = AnnoyIndex(data.shape[1],metric=annoy_metric) for i in np.arange(data.shape[0]): ckd.add_item(i,data[i,:]) ckd.build(n_trees) @@ -213,7 +217,7 @@ def trimming(cnts,trim): def bbknn(adata, batch_key='batch', use_rep='X_pca', approx=True, metric='cosine', copy=False, **kwargs): ''' - Batch balanced KNN, altering the KNN procedure to identify each cell's top neighbours in + Badistances = get_sparse_matrix_from_indices_distances_umap(knn_indices, knn_dists, n_obs, n_neighbors)tch balanced KNN, altering the KNN procedure to identify each cell's top neighbours in each batch separately instead of the entire cell pool with no accounting for batch. Aligns batches in a quick and lightweight manner. For use in the scanpy workflow as an alternative to ``scanpi.api.pp.neighbors()``.