From 36b881b5be84cfd8872adaad4f92a783e16e7fe4 Mon Sep 17 00:00:00 2001 From: TsumiNa Date: Sun, 21 Aug 2022 14:03:39 +0900 Subject: [PATCH 1/9] add codes --- xenonpy/descriptor/__init__.py | 1 + xenonpy/descriptor/compositions.py | 115 +++++++++++++++++++---------- xenonpy/descriptor/kernel.py | 27 +++++++ 3 files changed, 106 insertions(+), 37 deletions(-) create mode 100644 xenonpy/descriptor/kernel.py diff --git a/xenonpy/descriptor/__init__.py b/xenonpy/descriptor/__init__.py index de156fe..28d7fb4 100644 --- a/xenonpy/descriptor/__init__.py +++ b/xenonpy/descriptor/__init__.py @@ -7,3 +7,4 @@ from .fingerprint import * from .frozen_featurizer import * from .structure import * +from .kernel import * diff --git a/xenonpy/descriptor/compositions.py b/xenonpy/descriptor/compositions.py index 43c61c1..bfc4264 100644 --- a/xenonpy/descriptor/compositions.py +++ b/xenonpy/descriptor/compositions.py @@ -2,28 +2,86 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. 
class KernelMean(BaseFeaturizer):
    """Kernel mean composition descriptor.

    Embeds a chemical composition as the atomic-fraction-weighted mean of
    per-element kernel vectors: each element's row of the kernel matrix is
    averaged with weight equal to that element's fraction in the composition.
    """

    def __init__(self,
                 *,
                 kernel_matrix: Union[None, pd.DataFrame, Callable[[pd.DataFrame], Tuple[np.ndarray, Sequence[str]]]] = None,
                 feature_matrix: Union[None, pd.DataFrame] = None,
                 on_errors: str = 'raise',
                 return_type: str = 'any',
                 target_col: Union[Sequence[str], str, None] = 'composition',
                 n_jobs: int = 1):
        """
        Parameters
        ----------
        kernel_matrix
            A pre-computed kernel matrix as a ``pd.DataFrame`` whose rows are
            indexed by element symbol, or a callable
            ``f(feature_matrix) -> (matrix, labels)``, or ``None`` to build an
            RBF kernel matrix from ``feature_matrix``.
        feature_matrix
            Elemental feature table indexed by element symbol.
            ``None`` falls back to ``preset.elements_completed``.
        on_errors, return_type, target_col
            Forwarded to ``BaseFeaturizer``.
        n_jobs
            joblib parallelism used inside :meth:`featurize`. The parent class
            is deliberately initialized with ``n_jobs=0`` because this
            featurizer processes the whole batch itself.
        """
        super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type, target_col=target_col)

        if kernel_matrix is None:
            if feature_matrix is None:  # use built-in elemental info
                feature_matrix = preset.elements_completed

            # Per-feature RBF distributions; sum over the sigma axis, then
            # min-max scale each element row block.
            all_dists, centers = calculate_rbf_kernel_matrix(element_info=feature_matrix)
            all_dists = [MinMaxScaler().fit_transform(np.sum(m, axis=0).T).T for m in all_dists]

            # Label each kernel column `<feature>_<running index>` with an
            # independent counter per feature name.
            counters = {name: count() for name in centers.index.unique()}
            labels = [f'{name}_{next(counters[name])}' for name in centers.index]
            kernel_df = pd.DataFrame(np.concatenate(all_dists, axis=1), index=feature_matrix.index, columns=labels)
        elif callable(kernel_matrix):
            # NOTE(review): assumes the callable is given together with a
            # non-None `feature_matrix` — confirm with callers.
            matrix, labels = kernel_matrix(feature_matrix)
            kernel_df = pd.DataFrame(matrix, index=feature_matrix.index, columns=labels)
        else:
            # BUG FIX: a ready-made DataFrame carries its own index/columns;
            # the original re-wrapped it with `feature_matrix.index`, which
            # crashed when `feature_matrix` was not given.
            kernel_df = kernel_matrix.copy()

        self._kernel_matrix = kernel_df
        self.__n_jobs = n_jobs  # name-mangled: must not clash with the parent's `n_jobs`
        self.__authors__ = ['TsumiNa']

    def featurize(self, comps):
        """Featurize a batch of compositions.

        Parameters
        ----------
        comps
            Iterable of compositions, each a ``{element: amount}`` dict or a
            pymatgen ``Composition``.

        Returns
        -------
        np.ndarray
            Shape ``(n_samples, n_kernel_columns)``.
        """
        # unify to a plain python list
        if isinstance(comps, (pd.Series, np.ndarray)):
            comps = comps.tolist()

        kernel_matrix = self._kernel_matrix

        def _proportion_vec(comp):
            # atomic fractions aligned with the kernel matrix element index
            if isinstance(comp, PMGComp):
                comp = comp.as_dict()
            total = sum(comp.values())
            vec = np.zeros(kernel_matrix.shape[0])
            for elem, amount in comp.items():
                vec[kernel_matrix.index.get_loc(elem)] = amount / total
            return vec

        rows = Parallel(n_jobs=self.__n_jobs)(delayed(_proportion_vec)(c) for c in comps)
        # Weighted mean of element kernel vectors is a plain matrix product:
        # (n_samples, n_elements) @ (n_elements, n_kernel_columns).
        return np.stack(rows) @ kernel_matrix.values

    @property
    def feature_labels(self):
        # BUG FIX (vs patch 1): the original returned the never-assigned
        # `self._labels`; the column index of the kernel matrix is the label set.
        return self._kernel_matrix.columns
on_errors=on_errors, - elemental_info=elemental_info) + self.composition = WeightedAverage(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) + self.composition = WeightedSum(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) + self.composition = WeightedVariance(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) + self.composition = GeometricMean(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) + self.composition = HarmonicMean(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) + self.composition = MaxPooling(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) + self.composition = MinPooling(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) diff --git a/xenonpy/descriptor/kernel.py b/xenonpy/descriptor/kernel.py new file mode 100644 index 0000000..ed085b4 --- /dev/null +++ b/xenonpy/descriptor/kernel.py @@ -0,0 +1,27 @@ +# Copyright 2021 TsumiNa +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class RBFKernel():
    """Element-wise RBF kernel of samples against per-feature grids of centers.

    Given samples ``x_i`` of shape ``(n, d)`` and centers ``x_j`` of shape
    ``(d, k)``, calling the kernel returns an ``(n, d * k)`` array whose
    entries are ``exp(-(x - c)^2 / (2 * sigma^2))`` for every
    (sample, feature, center) triple.

    NOTE: the original comment claimed ``K = exp(-||x_i - x_j||^2 / (2 s^2))``,
    but the computation is a per-feature 1-D RBF, not a kernel of the full
    Euclidean norm — the docs here describe what the code actually does.
    """

    def __init__(self, sigma):
        # Bandwidth shared by all features and centers.
        self._sigma = sigma

    def __call__(self, x_i: np.ndarray, x_j: np.ndarray) -> np.ndarray:
        # (n, d, 1) - (d, k) broadcasts to (n, d, k); squaring elementwise and
        # flattening to (n, d*k) is identical to the original
        # reshape-then-square order.
        diff = x_i[:, :, np.newaxis] - x_j
        return np.exp(-(diff ** 2).reshape(x_i.shape[0], -1) / (2 * self._sigma ** 2))
a/xenonpy/descriptor/compositions.py +++ b/xenonpy/descriptor/compositions.py @@ -75,11 +75,12 @@ def featurize(self, comp): if isinstance(comp, PMGComp): comp = comp.as_dict() - return sum([self._kernel_matrix.loc[e].values for e, n in comp.items()]) + atoms = sum(comp.values()) + return sum([self._kernel_matrix.loc[e].values * (n / atoms) for e, n in comp.items()]) @property def feature_labels(self): - return self._labels + return self._kernel_matrix.columns class Counting(BaseCompositionFeaturizer): From 190f5b6cb9c2d965cbf29840caa15bf418c97647 Mon Sep 17 00:00:00 2001 From: TsumiNa Date: Sun, 21 Aug 2022 14:03:39 +0900 Subject: [PATCH 4/9] update --- xenonpy/descriptor/compositions.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/xenonpy/descriptor/compositions.py b/xenonpy/descriptor/compositions.py index 043e524..b8dd315 100644 --- a/xenonpy/descriptor/compositions.py +++ b/xenonpy/descriptor/compositions.py @@ -33,11 +33,10 @@ def __init__(self, *, feature_matrix: Union[None, pd.DataFrame] = None, grid: Union[None, int, Sequence[int], Sequence[Sequence[float]]] = None, - n_jobs=-1, on_errors='raise', return_type='any', target_col='composition'): - super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col) + super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type, target_col=target_col) if feature_matrix is None: # use elemental info feature_matrix = preset.elements_completed @@ -71,12 +70,23 @@ def __init__(self, self._kernel_matrix = pd.DataFrame(kernel_matrix, index=feature_matrix.index, columns=labels) - def featurize(self, comp): - if isinstance(comp, PMGComp): - comp = comp.as_dict() + def featurize(self, comps): + # Unified to python list + if isinstance(comps, (pd.Series, np.ndarray)): + comps = comps.tolist() + + size = len(comps) + kernel_matrix = self._kernel_matrix + proportion_matrix = np.zeros((size, kernel_matrix.shape[0])) + + for i, comp in 
enumerate(comps): + t = sum(comp.values()) + for (k, v) in comp.items(): + elem_i = kernel_matrix.index.get_loc(k) + proportion_matrix[i, elem_i] = v / t - atoms = sum(comp.values()) - return sum([self._kernel_matrix.loc[e].values * (n / atoms) for e, n in comp.items()]) + # fast way using matrix calculation + return (proportion_matrix.T[:, :, np.newaxis] @ (kernel_matrix.values)[:, np.newaxis, :]).sum(axis=0) @property def feature_labels(self): From 252359d34fd9600845d8284d69516eb29faddfea Mon Sep 17 00:00:00 2001 From: TsumiNa Date: Sun, 21 Aug 2022 14:03:39 +0900 Subject: [PATCH 5/9] improve performance; add test --- ...est_elemental.py => test_compositional.py} | 22 ++++++++++++++-- xenonpy/descriptor/compositions.py | 25 ++++++++++++++----- 2 files changed, 39 insertions(+), 8 deletions(-) rename tests/descriptor/{test_elemental.py => test_compositional.py} (74%) diff --git a/tests/descriptor/test_elemental.py b/tests/descriptor/test_compositional.py similarity index 74% rename from tests/descriptor/test_elemental.py rename to tests/descriptor/test_compositional.py index 0c52731..e6e8729 100644 --- a/tests/descriptor/test_elemental.py +++ b/tests/descriptor/test_compositional.py @@ -6,11 +6,11 @@ import pandas as pd import pytest -from xenonpy.descriptor import Compositions, Counting +from xenonpy.descriptor import Compositions, Counting, KernelMean, RBFKernel from xenonpy.descriptor.base import BaseCompositionFeaturizer -def test_compositional_feature_1(): +def test_base_composition_1(): class FakeFeaturizer(BaseCompositionFeaturizer): @@ -71,5 +71,23 @@ def test_comp_descriptor_1(): assert np.all(tmp1.values == tmp2.values) +def test_kernel_mean_1(): + comps = [{'H': 2}, {'Al': 3, 'Pd': 4}, {'C': 1, 'O': 5, 'H': 20}] + + n_bins = 2 + delta = 1 / (n_bins - 1) + kernel_mean = KernelMean(RBFKernel(sigma=delta * 0.4), grid=n_bins, n_jobs=1) + desc = kernel_mean.transform(comps) + assert desc.shape == (3, 116) + assert isinstance(desc, np.ndarray) + + n_bins = 3 
+ delta = 1 / (n_bins - 1) + kernel_mean = KernelMean(RBFKernel(sigma=delta * 0.4), grid=n_bins, n_jobs=1) + desc = kernel_mean.transform(pd.Series(comps)) + assert desc.shape == (3, 174) + assert isinstance(desc, pd.DataFrame) + + if __name__ == "__main__": pytest.main() diff --git a/xenonpy/descriptor/compositions.py b/xenonpy/descriptor/compositions.py index b8dd315..671344d 100644 --- a/xenonpy/descriptor/compositions.py +++ b/xenonpy/descriptor/compositions.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +from joblib import Parallel, delayed from pymatgen.core import Composition as PMGComp from sklearn.preprocessing import MinMaxScaler from xenonpy.datatools import preset @@ -35,7 +36,8 @@ def __init__(self, grid: Union[None, int, Sequence[int], Sequence[Sequence[float]]] = None, on_errors='raise', return_type='any', - target_col='composition'): + target_col='composition', + n_jobs: int = 1): super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type, target_col=target_col) if feature_matrix is None: # use elemental info @@ -69,23 +71,34 @@ def __init__(self, *[[f'{n}_k{k+1}' for k in range(g.size)] for n, g in zip(feature_matrix.columns, grid)]) self._kernel_matrix = pd.DataFrame(kernel_matrix, index=feature_matrix.index, columns=labels) + self.__n_jobs = n_jobs # this param should not overwrite the property of parent class + self.__authors__ = ['TsumiNa'] def featurize(self, comps): # Unified to python list if isinstance(comps, (pd.Series, np.ndarray)): comps = comps.tolist() - size = len(comps) kernel_matrix = self._kernel_matrix - proportion_matrix = np.zeros((size, kernel_matrix.shape[0])) - for i, comp in enumerate(comps): + def inner(comp): + # unified to python dict + if isinstance(comp, PMGComp): + comp = comp.as_dict() + + # calculate proportion vector for the given composition t = sum(comp.values()) + proportion_vec = np.zeros(kernel_matrix.shape[0]) for (k, v) in comp.items(): elem_i = kernel_matrix.index.get_loc(k) - 
proportion_matrix[i, elem_i] = v / t + proportion_vec[elem_i] = v / t + + return proportion_vec + + proportion_matrix = Parallel(n_jobs=self.__n_jobs)(delayed(inner)(comp) for comp in comps) + proportion_matrix = np.stack(proportion_matrix) - # fast way using matrix calculation + # fast way using dot calculation return (proportion_matrix.T[:, :, np.newaxis] @ (kernel_matrix.values)[:, np.newaxis, :]).sum(axis=0) @property From cef56d6284bfe74817e71698c101701817840666 Mon Sep 17 00:00:00 2001 From: TsumiNa Date: Sun, 21 Aug 2022 14:03:39 +0900 Subject: [PATCH 6/9] update --- xenonpy/descriptor/compositions.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/xenonpy/descriptor/compositions.py b/xenonpy/descriptor/compositions.py index 671344d..cb22c6d 100644 --- a/xenonpy/descriptor/compositions.py +++ b/xenonpy/descriptor/compositions.py @@ -28,15 +28,18 @@ class KernelMean(BaseFeaturizer): + """Add kernel mean descriptor." + + """ def __init__(self, kernel_func: Union[None, Callable[[np.ndarray, np.ndarray], np.ndarray]], *, feature_matrix: Union[None, pd.DataFrame] = None, grid: Union[None, int, Sequence[int], Sequence[Sequence[float]]] = None, - on_errors='raise', - return_type='any', - target_col='composition', + on_errors: str = 'raise', + return_type: str = 'any', + target_col: Union[List[str], str, None] = 'composition', n_jobs: int = 1): super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type, target_col=target_col) From 90e988724497f6b5d8947a8801cdab6a41549c34 Mon Sep 17 00:00:00 2001 From: TsumiNa Date: Sun, 21 Aug 2022 14:03:39 +0900 Subject: [PATCH 7/9] update --- xenonpy/descriptor/kernel.py | 85 +++++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 7 deletions(-) diff --git a/xenonpy/descriptor/kernel.py b/xenonpy/descriptor/kernel.py index ed085b4..feaf87d 100644 --- a/xenonpy/descriptor/kernel.py +++ b/xenonpy/descriptor/kernel.py @@ -13,15 +13,86 @@ # limitations under the License. 
# BUG FIX: the original `__all__ = ['rbf_kernel' 'calculate_rbf_kernel_matrix']`
# was missing a comma, so implicit string concatenation produced the single
# bogus name 'rbf_kernelcalculate_rbf_kernel_matrix' and neither function was
# exported via `from .kernel import *`.
__all__ = ['rbf_kernel', 'calculate_rbf_kernel_matrix']


def rbf_kernel(x_i: np.ndarray, x_j: Union[np.ndarray, int, float],
               sigmas: Union[float, int, np.ndarray, Sequence]) -> np.ndarray:
    """
    Radial Basis Function (RBF) kernel function.
    https://en.wikipedia.org/wiki/Radial_basis_function_kernel

    Parameters
    ----------
    x_i:
        Sample values; a 1-d array.
    x_j:
        Kernel centers; a scalar or 1-d array.
    sigmas:
        The standard deviations (SD). A scalar or 1-d array-like.

    Returns
    -------
    np.ndarray
        ``(len(x_i), len(x_j))`` when a single sigma is given, otherwise
        ``(len(sigmas), len(x_i), len(x_j))``.

    Raises
    ------
    ValueError
        Raise error if sigmas has wrong dimension.
    """
    sigmas = np.asarray(sigmas)
    if sigmas.ndim == 0:
        sigmas = sigmas[np.newaxis]
    if sigmas.ndim != 1:
        raise ValueError('parameter `sigmas` must be a array-like object which has dimension 1')

    # K(x_i, x_j) = exp(-(x_i - x_j)^2 / (2 * sigma^2)), broadcast over every
    # (sample, center, sigma) triple.
    sq_diff = np.power(np.expand_dims(x_i, axis=x_i.ndim) - x_j, 2)
    denom = np.power(sigmas, 2) * 2
    # BUG FIX: the original hard-coded `transpose([2, 0, 1])`, which only works
    # for 1-d `x_i`; `moveaxis` brings the sigma axis to the front for any
    # input rank and is identical for the 1-d case.
    dists = np.moveaxis(np.exp(-np.expand_dims(sq_diff, axis=sq_diff.ndim) / denom), -1, 0)

    if dists.shape[0] == 1:
        return dists[0]
    return dists


def calculate_rbf_kernel_matrix(
    *,
    element_info: Union[None, pd.DataFrame] = None,
    scaled_element_info: bool = False,
    quartiles: Sequence[int] = (25, 50, 75),
    half_interval_by_sigma: float = 2,
    sort_centers: bool = True,
):
    """Build per-feature RBF kernel distributions over elemental features.

    For every feature column, the unique feature values become kernel centers,
    and the sigmas are derived from percentiles of the half-gaps between
    consecutive values of that feature.

    Parameters
    ----------
    element_info:
        Elemental feature table; ``None`` uses ``preset.elements_completed``.
    scaled_element_info:
        Min-max scale the table column-wise before processing.
    quartiles:
        Percentiles of the half-intervals used as sigma candidates.
    half_interval_by_sigma:
        Divisor mapping a half-interval percentile to a sigma.
    sort_centers:
        Use sorted unique centers (``np.unique``) instead of
        first-appearance order (``pd.Series.unique``).

    Returns
    -------
    (list of np.ndarray, pd.Series)
        One ``(len(quartiles), n_elements, n_centers)`` array per feature
        (sigma axis dropped when a single quartile is given), and a Series of
        centers indexed by feature name.
    """
    if element_info is None:
        from xenonpy.datatools import preset  # deferred: only needed for the default table
        element_info = preset.elements_completed

    if scaled_element_info:
        from sklearn.preprocessing import MinMaxScaler  # deferred: optional path
        element_info = pd.DataFrame(MinMaxScaler().fit_transform(element_info),
                                    columns=element_info.columns,
                                    index=element_info.index)

    all_dists = []
    center_labels = []
    # BUG FIX: `iteritems()` was removed in pandas 2.0; `items()` is the
    # long-standing equivalent.
    for feature, data in element_info.items():
        if sort_centers:
            data = data.values
            centers = np.unique(data)
        else:
            centers = data.unique()
            data = data.values

        # unique absolute gaps between consecutive values (in current order)
        intervals = np.unique(np.abs(np.diff(data)))
        # BUG FIX: the original shadowed and ignored the `quartiles` argument,
        # always using the hard-coded [25, 50, 75].
        half_quantiles = np.percentile(intervals / 2, quartiles)
        # NOTE(review): duplicate consecutive values yield a zero gap and thus
        # sigma == 0 (division by zero downstream) — assumed distinct values;
        # confirm against the element tables.
        sigmas = half_quantiles / half_interval_by_sigma

        dists = rbf_kernel(data, centers, sigmas)
        all_dists.append(dists)
        center_labels.append(pd.Series(centers, index=[feature] * centers.size))

    return all_dists, pd.concat(center_labels)
+from itertools import count from typing import Callable, List, Sequence, Union, Tuple import numpy as np @@ -34,7 +35,8 @@ class KernelMean(BaseFeaturizer): def __init__(self, *, - kernel_matrix: Union[None, pd.DataFrame, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, str]]], + kernel_matrix: Union[None, pd.DataFrame, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, + str]]] = None, feature_matrix: Union[None, pd.DataFrame] = None, on_errors: str = 'raise', return_type: str = 'any', @@ -51,7 +53,9 @@ def __init__(self, all_dists, labels = calculate_rbf_kernel_matrix(element_info=feature_matrix) all_dists = [MinMaxScaler().fit_transform(np.sum(m, axis=0).T).T for m in all_dists ] # MinMaxScale for each element - kernel_matrix, labels = np.concatenate(all_dists, axis=1), [f'{l}_{i}' for i, l in enumerate(labels.index)] + count_mapper = {k: count() for k in labels.index.unique()} + kernel_matrix, labels = np.concatenate(all_dists, + axis=1), [f'{l}_{count_mapper[l].__next__()}' for l in labels.index] elif callable(kernel_matrix): # calculate kernel matrix kernel_matrix, labels = kernel_matrix(feature_matrix)