From 36b881b5be84cfd8872adaad4f92a783e16e7fe4 Mon Sep 17 00:00:00 2001 From: TsumiNa Date: Sun, 21 Aug 2022 14:03:39 +0900 Subject: [PATCH 1/9] add codes --- xenonpy/descriptor/__init__.py | 1 + xenonpy/descriptor/compositions.py | 115 +++++++++++++++++++---------- xenonpy/descriptor/kernel.py | 27 +++++++ 3 files changed, 106 insertions(+), 37 deletions(-) create mode 100644 xenonpy/descriptor/kernel.py diff --git a/xenonpy/descriptor/__init__.py b/xenonpy/descriptor/__init__.py index de156fe..28d7fb4 100644 --- a/xenonpy/descriptor/__init__.py +++ b/xenonpy/descriptor/__init__.py @@ -7,3 +7,4 @@ from .fingerprint import * from .frozen_featurizer import * from .structure import * +from .kernel import * diff --git a/xenonpy/descriptor/compositions.py b/xenonpy/descriptor/compositions.py index 43c61c1..bfc4264 100644 --- a/xenonpy/descriptor/compositions.py +++ b/xenonpy/descriptor/compositions.py @@ -2,28 +2,86 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. 
class KernelMean(BaseFeaturizer):
    """Kernel mean composition descriptor.

    Embeds a chemical composition as the atomic-fraction-weighted mean of
    per-element kernel vectors: each element's row of the kernel matrix is
    averaged with weight equal to that element's fraction in the composition.
    """

    def __init__(self,
                 *,
                 kernel_matrix: Union[None, pd.DataFrame, Callable[[pd.DataFrame], Tuple[np.ndarray, Sequence[str]]]] = None,
                 feature_matrix: Union[None, pd.DataFrame] = None,
                 on_errors: str = 'raise',
                 return_type: str = 'any',
                 target_col: Union[Sequence[str], str, None] = 'composition',
                 n_jobs: int = 1):
        """
        Parameters
        ----------
        kernel_matrix
            A pre-computed kernel matrix as a ``pd.DataFrame`` whose rows are
            indexed by element symbol, or a callable
            ``f(feature_matrix) -> (matrix, labels)``, or ``None`` to build an
            RBF kernel matrix from ``feature_matrix``.
        feature_matrix
            Elemental feature table indexed by element symbol.
            ``None`` falls back to ``preset.elements_completed``.
        on_errors, return_type, target_col
            Forwarded to ``BaseFeaturizer``.
        n_jobs
            joblib parallelism used inside :meth:`featurize`. The parent class
            is deliberately initialized with ``n_jobs=0`` because this
            featurizer processes the whole batch itself.
        """
        super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type, target_col=target_col)

        if kernel_matrix is None:
            if feature_matrix is None:  # use built-in elemental info
                feature_matrix = preset.elements_completed

            # Per-feature RBF distributions; sum over the sigma axis, then
            # min-max scale each element row block.
            all_dists, centers = calculate_rbf_kernel_matrix(element_info=feature_matrix)
            all_dists = [MinMaxScaler().fit_transform(np.sum(m, axis=0).T).T for m in all_dists]

            # Label each kernel column `<feature>_<running index>` with an
            # independent counter per feature name.
            counters = {name: count() for name in centers.index.unique()}
            labels = [f'{name}_{next(counters[name])}' for name in centers.index]
            kernel_df = pd.DataFrame(np.concatenate(all_dists, axis=1), index=feature_matrix.index, columns=labels)
        elif callable(kernel_matrix):
            # NOTE(review): assumes the callable is given together with a
            # non-None `feature_matrix` — confirm with callers.
            matrix, labels = kernel_matrix(feature_matrix)
            kernel_df = pd.DataFrame(matrix, index=feature_matrix.index, columns=labels)
        else:
            # BUG FIX: a ready-made DataFrame carries its own index/columns;
            # the original re-wrapped it with `feature_matrix.index`, which
            # crashed when `feature_matrix` was not given.
            kernel_df = kernel_matrix.copy()

        self._kernel_matrix = kernel_df
        self.__n_jobs = n_jobs  # name-mangled: must not clash with the parent's `n_jobs`
        self.__authors__ = ['TsumiNa']

    def featurize(self, comps):
        """Featurize a batch of compositions.

        Parameters
        ----------
        comps
            Iterable of compositions, each a ``{element: amount}`` dict or a
            pymatgen ``Composition``.

        Returns
        -------
        np.ndarray
            Shape ``(n_samples, n_kernel_columns)``.
        """
        # unify to a plain python list
        if isinstance(comps, (pd.Series, np.ndarray)):
            comps = comps.tolist()

        kernel_matrix = self._kernel_matrix

        def _proportion_vec(comp):
            # atomic fractions aligned with the kernel matrix element index
            if isinstance(comp, PMGComp):
                comp = comp.as_dict()
            total = sum(comp.values())
            vec = np.zeros(kernel_matrix.shape[0])
            for elem, amount in comp.items():
                vec[kernel_matrix.index.get_loc(elem)] = amount / total
            return vec

        rows = Parallel(n_jobs=self.__n_jobs)(delayed(_proportion_vec)(c) for c in comps)
        # Weighted mean of element kernel vectors is a plain matrix product:
        # (n_samples, n_elements) @ (n_elements, n_kernel_columns).
        return np.stack(rows) @ kernel_matrix.values

    @property
    def feature_labels(self):
        # BUG FIX (vs patch 1): the original returned the never-assigned
        # `self._labels`; the column index of the kernel matrix is the label set.
        return self._kernel_matrix.columns
on_errors=on_errors, - elemental_info=elemental_info) + self.composition = WeightedAverage(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) + self.composition = WeightedSum(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) + self.composition = WeightedVariance(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) + self.composition = GeometricMean(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) + self.composition = HarmonicMean(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) + self.composition = MaxPooling(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) + self.composition = MinPooling(n_jobs=n_jobs, on_errors=on_errors, elemental_info=elemental_info) diff --git a/xenonpy/descriptor/kernel.py b/xenonpy/descriptor/kernel.py new file mode 100644 index 0000000..ed085b4 --- /dev/null +++ b/xenonpy/descriptor/kernel.py @@ -0,0 +1,27 @@ +# Copyright 2021 TsumiNa +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class RBFKernel():
    """Element-wise RBF kernel of samples against per-feature grids of centers.

    Given samples ``x_i`` of shape ``(n, d)`` and centers ``x_j`` of shape
    ``(d, k)``, calling the kernel returns an ``(n, d * k)`` array whose
    entries are ``exp(-(x - c)^2 / (2 * sigma^2))`` for every
    (sample, feature, center) triple.

    NOTE: the original comment claimed ``K = exp(-||x_i - x_j||^2 / (2 s^2))``,
    but the computation is a per-feature 1-D RBF, not a kernel of the full
    Euclidean norm — the docs here describe what the code actually does.
    """

    def __init__(self, sigma):
        # Bandwidth shared by all features and centers.
        self._sigma = sigma

    def __call__(self, x_i: np.ndarray, x_j: np.ndarray) -> np.ndarray:
        # (n, d, 1) - (d, k) broadcasts to (n, d, k); squaring elementwise and
        # flattening to (n, d*k) is identical to the original
        # reshape-then-square order.
        diff = x_i[:, :, np.newaxis] - x_j
        return np.exp(-(diff ** 2).reshape(x_i.shape[0], -1) / (2 * self._sigma ** 2))
a/xenonpy/descriptor/compositions.py +++ b/xenonpy/descriptor/compositions.py @@ -75,11 +75,12 @@ def featurize(self, comp): if isinstance(comp, PMGComp): comp = comp.as_dict() - return sum([self._kernel_matrix.loc[e].values for e, n in comp.items()]) + atoms = sum(comp.values()) + return sum([self._kernel_matrix.loc[e].values * (n / atoms) for e, n in comp.items()]) @property def feature_labels(self): - return self._labels + return self._kernel_matrix.columns class Counting(BaseCompositionFeaturizer): From 190f5b6cb9c2d965cbf29840caa15bf418c97647 Mon Sep 17 00:00:00 2001 From: TsumiNa Date: Sun, 21 Aug 2022 14:03:39 +0900 Subject: [PATCH 4/9] update --- xenonpy/descriptor/compositions.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/xenonpy/descriptor/compositions.py b/xenonpy/descriptor/compositions.py index 043e524..b8dd315 100644 --- a/xenonpy/descriptor/compositions.py +++ b/xenonpy/descriptor/compositions.py @@ -33,11 +33,10 @@ def __init__(self, *, feature_matrix: Union[None, pd.DataFrame] = None, grid: Union[None, int, Sequence[int], Sequence[Sequence[float]]] = None, - n_jobs=-1, on_errors='raise', return_type='any', target_col='composition'): - super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col) + super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type, target_col=target_col) if feature_matrix is None: # use elemental info feature_matrix = preset.elements_completed @@ -71,12 +70,23 @@ def __init__(self, self._kernel_matrix = pd.DataFrame(kernel_matrix, index=feature_matrix.index, columns=labels) - def featurize(self, comp): - if isinstance(comp, PMGComp): - comp = comp.as_dict() + def featurize(self, comps): + # Unified to python list + if isinstance(comps, (pd.Series, np.ndarray)): + comps = comps.tolist() + + size = len(comps) + kernel_matrix = self._kernel_matrix + proportion_matrix = np.zeros((size, kernel_matrix.shape[0])) + + for i, comp in 
enumerate(comps): + t = sum(comp.values()) + for (k, v) in comp.items(): + elem_i = kernel_matrix.index.get_loc(k) + proportion_matrix[i, elem_i] = v / t - atoms = sum(comp.values()) - return sum([self._kernel_matrix.loc[e].values * (n / atoms) for e, n in comp.items()]) + # fast way using matrix calculation + return (proportion_matrix.T[:, :, np.newaxis] @ (kernel_matrix.values)[:, np.newaxis, :]).sum(axis=0) @property def feature_labels(self): From 252359d34fd9600845d8284d69516eb29faddfea Mon Sep 17 00:00:00 2001 From: TsumiNa Date: Sun, 21 Aug 2022 14:03:39 +0900 Subject: [PATCH 5/9] improve performance; add test --- ...est_elemental.py => test_compositional.py} | 22 ++++++++++++++-- xenonpy/descriptor/compositions.py | 25 ++++++++++++++----- 2 files changed, 39 insertions(+), 8 deletions(-) rename tests/descriptor/{test_elemental.py => test_compositional.py} (74%) diff --git a/tests/descriptor/test_elemental.py b/tests/descriptor/test_compositional.py similarity index 74% rename from tests/descriptor/test_elemental.py rename to tests/descriptor/test_compositional.py index 0c52731..e6e8729 100644 --- a/tests/descriptor/test_elemental.py +++ b/tests/descriptor/test_compositional.py @@ -6,11 +6,11 @@ import pandas as pd import pytest -from xenonpy.descriptor import Compositions, Counting +from xenonpy.descriptor import Compositions, Counting, KernelMean, RBFKernel from xenonpy.descriptor.base import BaseCompositionFeaturizer -def test_compositional_feature_1(): +def test_base_composition_1(): class FakeFeaturizer(BaseCompositionFeaturizer): @@ -71,5 +71,23 @@ def test_comp_descriptor_1(): assert np.all(tmp1.values == tmp2.values) +def test_kernel_mean_1(): + comps = [{'H': 2}, {'Al': 3, 'Pd': 4}, {'C': 1, 'O': 5, 'H': 20}] + + n_bins = 2 + delta = 1 / (n_bins - 1) + kernel_mean = KernelMean(RBFKernel(sigma=delta * 0.4), grid=n_bins, n_jobs=1) + desc = kernel_mean.transform(comps) + assert desc.shape == (3, 116) + assert isinstance(desc, np.ndarray) + + n_bins = 3 
+ delta = 1 / (n_bins - 1) + kernel_mean = KernelMean(RBFKernel(sigma=delta * 0.4), grid=n_bins, n_jobs=1) + desc = kernel_mean.transform(pd.Series(comps)) + assert desc.shape == (3, 174) + assert isinstance(desc, pd.DataFrame) + + if __name__ == "__main__": pytest.main() diff --git a/xenonpy/descriptor/compositions.py b/xenonpy/descriptor/compositions.py index b8dd315..671344d 100644 --- a/xenonpy/descriptor/compositions.py +++ b/xenonpy/descriptor/compositions.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +from joblib import Parallel, delayed from pymatgen.core import Composition as PMGComp from sklearn.preprocessing import MinMaxScaler from xenonpy.datatools import preset @@ -35,7 +36,8 @@ def __init__(self, grid: Union[None, int, Sequence[int], Sequence[Sequence[float]]] = None, on_errors='raise', return_type='any', - target_col='composition'): + target_col='composition', + n_jobs: int = 1): super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type, target_col=target_col) if feature_matrix is None: # use elemental info @@ -69,23 +71,34 @@ def __init__(self, *[[f'{n}_k{k+1}' for k in range(g.size)] for n, g in zip(feature_matrix.columns, grid)]) self._kernel_matrix = pd.DataFrame(kernel_matrix, index=feature_matrix.index, columns=labels) + self.__n_jobs = n_jobs # this param should not overwrite the property of parent class + self.__authors__ = ['TsumiNa'] def featurize(self, comps): # Unified to python list if isinstance(comps, (pd.Series, np.ndarray)): comps = comps.tolist() - size = len(comps) kernel_matrix = self._kernel_matrix - proportion_matrix = np.zeros((size, kernel_matrix.shape[0])) - for i, comp in enumerate(comps): + def inner(comp): + # unified to python dict + if isinstance(comp, PMGComp): + comp = comp.as_dict() + + # calculate proportion vector for the given composition t = sum(comp.values()) + proportion_vec = np.zeros(kernel_matrix.shape[0]) for (k, v) in comp.items(): elem_i = kernel_matrix.index.get_loc(k) - 
proportion_matrix[i, elem_i] = v / t + proportion_vec[elem_i] = v / t + + return proportion_vec + + proportion_matrix = Parallel(n_jobs=self.__n_jobs)(delayed(inner)(comp) for comp in comps) + proportion_matrix = np.stack(proportion_matrix) - # fast way using matrix calculation + # fast way using dot calculation return (proportion_matrix.T[:, :, np.newaxis] @ (kernel_matrix.values)[:, np.newaxis, :]).sum(axis=0) @property From cef56d6284bfe74817e71698c101701817840666 Mon Sep 17 00:00:00 2001 From: TsumiNa Date: Sun, 21 Aug 2022 14:03:39 +0900 Subject: [PATCH 6/9] update --- xenonpy/descriptor/compositions.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/xenonpy/descriptor/compositions.py b/xenonpy/descriptor/compositions.py index 671344d..cb22c6d 100644 --- a/xenonpy/descriptor/compositions.py +++ b/xenonpy/descriptor/compositions.py @@ -28,15 +28,18 @@ class KernelMean(BaseFeaturizer): + """Add kernel mean descriptor." + + """ def __init__(self, kernel_func: Union[None, Callable[[np.ndarray, np.ndarray], np.ndarray]], *, feature_matrix: Union[None, pd.DataFrame] = None, grid: Union[None, int, Sequence[int], Sequence[Sequence[float]]] = None, - on_errors='raise', - return_type='any', - target_col='composition', + on_errors: str = 'raise', + return_type: str = 'any', + target_col: Union[List[str], str, None] = 'composition', n_jobs: int = 1): super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type, target_col=target_col) From 90e988724497f6b5d8947a8801cdab6a41549c34 Mon Sep 17 00:00:00 2001 From: TsumiNa Date: Sun, 21 Aug 2022 14:03:39 +0900 Subject: [PATCH 7/9] update --- xenonpy/descriptor/kernel.py | 85 +++++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 7 deletions(-) diff --git a/xenonpy/descriptor/kernel.py b/xenonpy/descriptor/kernel.py index ed085b4..feaf87d 100644 --- a/xenonpy/descriptor/kernel.py +++ b/xenonpy/descriptor/kernel.py @@ -13,15 +13,86 @@ # limitations under the License. 
# BUG FIX: the original `__all__ = ['rbf_kernel' 'calculate_rbf_kernel_matrix']`
# was missing a comma, so implicit string concatenation produced the single
# bogus name 'rbf_kernelcalculate_rbf_kernel_matrix' and neither function was
# exported via `from .kernel import *`.
__all__ = ['rbf_kernel', 'calculate_rbf_kernel_matrix']


def rbf_kernel(x_i: np.ndarray, x_j: Union[np.ndarray, int, float],
               sigmas: Union[float, int, np.ndarray, Sequence]) -> np.ndarray:
    """
    Radial Basis Function (RBF) kernel function.
    https://en.wikipedia.org/wiki/Radial_basis_function_kernel

    Parameters
    ----------
    x_i:
        Sample values; a 1-d array.
    x_j:
        Kernel centers; a scalar or 1-d array.
    sigmas:
        The standard deviations (SD). A scalar or 1-d array-like.

    Returns
    -------
    np.ndarray
        ``(len(x_i), len(x_j))`` when a single sigma is given, otherwise
        ``(len(sigmas), len(x_i), len(x_j))``.

    Raises
    ------
    ValueError
        Raise error if sigmas has wrong dimension.
    """
    sigmas = np.asarray(sigmas)
    if sigmas.ndim == 0:
        sigmas = sigmas[np.newaxis]
    if sigmas.ndim != 1:
        raise ValueError('parameter `sigmas` must be a array-like object which has dimension 1')

    # K(x_i, x_j) = exp(-(x_i - x_j)^2 / (2 * sigma^2)), broadcast over every
    # (sample, center, sigma) triple.
    sq_diff = np.power(np.expand_dims(x_i, axis=x_i.ndim) - x_j, 2)
    denom = np.power(sigmas, 2) * 2
    # BUG FIX: the original hard-coded `transpose([2, 0, 1])`, which only works
    # for 1-d `x_i`; `moveaxis` brings the sigma axis to the front for any
    # input rank and is identical for the 1-d case.
    dists = np.moveaxis(np.exp(-np.expand_dims(sq_diff, axis=sq_diff.ndim) / denom), -1, 0)

    if dists.shape[0] == 1:
        return dists[0]
    return dists


def calculate_rbf_kernel_matrix(
    *,
    element_info: Union[None, pd.DataFrame] = None,
    scaled_element_info: bool = False,
    quartiles: Sequence[int] = (25, 50, 75),
    half_interval_by_sigma: float = 2,
    sort_centers: bool = True,
):
    """Build per-feature RBF kernel distributions over elemental features.

    For every feature column, the unique feature values become kernel centers,
    and the sigmas are derived from percentiles of the half-gaps between
    consecutive values of that feature.

    Parameters
    ----------
    element_info:
        Elemental feature table; ``None`` uses ``preset.elements_completed``.
    scaled_element_info:
        Min-max scale the table column-wise before processing.
    quartiles:
        Percentiles of the half-intervals used as sigma candidates.
    half_interval_by_sigma:
        Divisor mapping a half-interval percentile to a sigma.
    sort_centers:
        Use sorted unique centers (``np.unique``) instead of
        first-appearance order (``pd.Series.unique``).

    Returns
    -------
    (list of np.ndarray, pd.Series)
        One ``(len(quartiles), n_elements, n_centers)`` array per feature
        (sigma axis dropped when a single quartile is given), and a Series of
        centers indexed by feature name.
    """
    if element_info is None:
        from xenonpy.datatools import preset  # deferred: only needed for the default table
        element_info = preset.elements_completed

    if scaled_element_info:
        from sklearn.preprocessing import MinMaxScaler  # deferred: optional path
        element_info = pd.DataFrame(MinMaxScaler().fit_transform(element_info),
                                    columns=element_info.columns,
                                    index=element_info.index)

    all_dists = []
    center_labels = []
    # BUG FIX: `iteritems()` was removed in pandas 2.0; `items()` is the
    # long-standing equivalent.
    for feature, data in element_info.items():
        if sort_centers:
            data = data.values
            centers = np.unique(data)
        else:
            centers = data.unique()
            data = data.values

        # unique absolute gaps between consecutive values (in current order)
        intervals = np.unique(np.abs(np.diff(data)))
        # BUG FIX: the original shadowed and ignored the `quartiles` argument,
        # always using the hard-coded [25, 50, 75].
        half_quantiles = np.percentile(intervals / 2, quartiles)
        # NOTE(review): duplicate consecutive values yield a zero gap and thus
        # sigma == 0 (division by zero downstream) — assumed distinct values;
        # confirm against the element tables.
        sigmas = half_quantiles / half_interval_by_sigma

        dists = rbf_kernel(data, centers, sigmas)
        all_dists.append(dists)
        center_labels.append(pd.Series(centers, index=[feature] * centers.size))

    return all_dists, pd.concat(center_labels)
+from itertools import count from typing import Callable, List, Sequence, Union, Tuple import numpy as np @@ -34,7 +35,8 @@ class KernelMean(BaseFeaturizer): def __init__(self, *, - kernel_matrix: Union[None, pd.DataFrame, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, str]]], + kernel_matrix: Union[None, pd.DataFrame, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, + str]]] = None, feature_matrix: Union[None, pd.DataFrame] = None, on_errors: str = 'raise', return_type: str = 'any', @@ -51,7 +53,9 @@ def __init__(self, all_dists, labels = calculate_rbf_kernel_matrix(element_info=feature_matrix) all_dists = [MinMaxScaler().fit_transform(np.sum(m, axis=0).T).T for m in all_dists ] # MinMaxScale for each element - kernel_matrix, labels = np.concatenate(all_dists, axis=1), [f'{l}_{i}' for i, l in enumerate(labels.index)] + count_mapper = {k: count() for k in labels.index.unique()} + kernel_matrix, labels = np.concatenate(all_dists, + axis=1), [f'{l}_{count_mapper[l].__next__()}' for l in labels.index] elif callable(kernel_matrix): # calculate kernel matrix kernel_matrix, labels = kernel_matrix(feature_matrix)