From 922b2b7924085d91410ecfdc86e950373477a4d6 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Mon, 27 Jun 2022 12:04:23 +0200 Subject: [PATCH 01/10] Upgraded pymatgen and matminer requirements --- README.md | 6 ------ modnet/featurizers/featurizers.py | 8 ++++---- modnet/preprocessing.py | 10 +++++----- setup.py | 8 ++++---- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index d4ffd405..72a4b761 100644 --- a/README.md +++ b/README.md @@ -45,12 +45,6 @@ activate the environment: conda activate modnet ``` -Then, install pymatgen v2020.8.13 with conda, which will bundle several pre-built dependencies (e.g., numpy, scipy): - -```shell -conda install -c conda-forge pymatgen=2020.8.13 -``` - Finally, install MODNet from PyPI with pip: ```bash diff --git a/modnet/featurizers/featurizers.py b/modnet/featurizers/featurizers.py index 0835668c..0fd3ec77 100644 --- a/modnet/featurizers/featurizers.py +++ b/modnet/featurizers/featurizers.py @@ -70,7 +70,7 @@ def featurize(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. Returns: The featurized DataFrame. @@ -137,7 +137,7 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. Returns: pandas.DataFrame: the decorated DataFrame, or an empty @@ -184,7 +184,7 @@ def featurize_structure(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. Returns: pandas.DataFrame: the decorated DataFrame. 
@@ -206,7 +206,7 @@ def featurize_site( Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. aliases: optional dictionary to map matminer output column names to new aliases, mostly used for backwards-compatibility. diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index 8cf3bed5..7b888eee 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -13,7 +13,7 @@ from typing import Dict, List, Union, Optional, Callable, Hashable, Iterable, Tuple from functools import partial -from pymatgen import Structure, Composition +from pymatgen.core import Structure, Composition from sklearn.feature_selection import mutual_info_regression, mutual_info_classif from sklearn.utils import resample @@ -539,14 +539,14 @@ def merge_ranked(lists: List[List[Hashable]]) -> List[Hashable]: class MODData: - """The MODData class takes takes a list of `pymatgen.Structure` + """The MODData class takes takes a list of `pymatgen.core.structure.Structure` objects and creates a `pandas.DataFrame` that contains many matminer features per structure. It then uses mutual information between features and targets, and between the features themselves, to perform feature selection using relevance-redundancy indices. Attributes: - df_structure (pd.DataFrame): dataframe storing the `pymatgen.Structure` + df_structure (pd.DataFrame): dataframe storing the `pymatgen.core.structure.Structure` representations for each structured, indexed by ID. df_targets (pd.Dataframe): dataframe storing the prediction targets per structure, indexed by ID. 
@@ -906,12 +906,12 @@ def rebalance(self): @property def structures(self) -> List[Union[Structure, CompositionContainer]]: - """Returns the list of `pymatgen.Structure` objects.""" + """Returns the list of `pymatgen.core.structure.Structure` objects.""" return list(self.df_structure["structure"]) @property def compositions(self) -> List[Union[Structure, CompositionContainer]]: - """Returns the list of materials as`pymatgen.Composition` objects.""" + """Returns the list of materials as`pymatgen.core.composition.Composition` objects.""" return [s.composition for s in self.df_structure["structure"]] @property diff --git a/setup.py b/setup.py index 45d311d3..bb05700a 100644 --- a/setup.py +++ b/setup.py @@ -37,10 +37,10 @@ "pandas>=0.25.3", "tensorflow>=2.4", "tensorflow-probability>=0.12", - "pymatgen>=2020,<2020.9", - "matminer>=0.6.2", - "numpy>=1.18.3", - "scikit-learn>=0.23,<0.24", + "pymatgen>=2022.5.17", + "matminer>=0.7.6", + "numpy>=1.22.3", + "scikit-learn>=1.1.0", ], tests_require=tests_require, test_suite="modnet.tests", From e2d5925c6a1bd6fdd1f1b5b338e7df40dded7cc0 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Thu, 23 Feb 2023 16:36:11 +0100 Subject: [PATCH 02/10] Added a preset with updated matminer featurizers for the composition (all of them). Also added an option to include only the featurizers that are continuous w.r.t. the composition. 
--- modnet/featurizers/featurizers.py | 39 +- modnet/featurizers/presets/__init__.py | 3 + modnet/featurizers/presets/matminer_2023.py | 2 +- .../featurizers/presets/matminer_all_2023.py | 341 ++++++++++++++++++ modnet/preprocessing.py | 4 +- 5 files changed, 377 insertions(+), 12 deletions(-) create mode 100644 modnet/featurizers/presets/matminer_all_2023.py diff --git a/modnet/featurizers/featurizers.py b/modnet/featurizers/featurizers.py index ca8b20c7..ad8636c8 100644 --- a/modnet/featurizers/featurizers.py +++ b/modnet/featurizers/featurizers.py @@ -2,6 +2,7 @@ from typing import Optional, Iterable, Tuple, Dict import pandas as pd +from pymatgen.core import Composition from matminer.featurizers.base import MultipleFeaturizer, BaseFeaturizer from matminer.featurizers.structure import SiteStatsFingerprint @@ -40,7 +41,9 @@ class MODFeaturizer(abc.ABC): """ + composition_continuous_featurizers: Optional[Iterable[BaseFeaturizer]] = None composition_featurizers: Optional[Iterable[BaseFeaturizer]] = None + oxid_composition_continuous_featurizers: Optional[Iterable[BaseFeaturizer]] = None oxid_composition_featurizers: Optional[Iterable[BaseFeaturizer]] = None structure_featurizers: Optional[Iterable[BaseFeaturizer]] = None site_featurizers: Optional[Iterable[BaseFeaturizer]] = None @@ -81,7 +84,7 @@ def featurize(self, df: pd.DataFrame) -> pd.DataFrame: """ df_composition = pd.DataFrame([]) - if self.composition_featurizers or self.oxid_composition_featurizers: + if self.composition_featurizers or self.oxid_composition_featurizers or self.composition_continuous_featurizers: df_composition = self.featurize_composition(df) df_structure = pd.DataFrame([]) @@ -189,32 +192,52 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame: df = df.copy() - if self.composition_featurizers: + if self.composition_featurizers or self.composition_continuous_featurizers: LOG.info("Applying composition featurizers...") df["composition"] = df["structure"].apply(lambda s: 
s.composition) df = self._fit_apply_featurizers( df, - self.composition_featurizers, + self.composition_featurizers or self.composition_continuous_featurizers, "composition", mode=self.featurizer_mode, ) df = df.rename(columns={"Input Data": ""}) df.columns = df.columns.map("|".join).str.strip("|") - if self.oxid_composition_featurizers: + if self.oxid_composition_featurizers or self.oxid_composition_continuous_featurizers: LOG.info("Applying oxidation state featurizers...") + # Get integer composition if some are not + col_comp = "composition" + if not all(all(amt == int(amt) for amt in comp.values()) for comp in df["composition"].values): + LOG.info("There are non-integer compositions in the dataset, and featurizers that need them. " + "Computing...") + df["integer_composition"] = [ + Composition(comp.get_integer_formula_and_factor( + max_denominator=5 if getattr(self, "fast_oxid", False) else 100)[0] + ) + for comp in df["composition"].values + ] + # df["integer_composition"] = df["composition"].apply( + # lambda c: c.get_integer_formula_and_factor( + # max_denominator=10 if getattr(self, "fast_oxid", False) else 100 + # )[0] + # ) + + col_comp = "integer_composition" if getattr(self, "fast_oxid", False): df = CompositionToOxidComposition( all_oxi_states=False, max_sites=-1 - ).featurize_dataframe(df, "composition") + ).featurize_dataframe(df, col_id=col_comp) else: - df = CompositionToOxidComposition().featurize_dataframe( - df, "composition" + df = CompositionToOxidComposition( + max_sites=-1 if self.oxid_composition_continuous_featurizers else None + ).featurize_dataframe( + df, col_id=col_comp, ignore_errors=True ) df = self._fit_apply_featurizers( df, - self.oxid_composition_featurizers, + self.oxid_composition_featurizers or self.oxid_composition_continuous_featurizers, "composition_oxid", mode=self.featurizer_mode, ) diff --git a/modnet/featurizers/presets/__init__.py b/modnet/featurizers/presets/__init__.py index f1417fb7..f39fa070 100644 --- 
a/modnet/featurizers/presets/__init__.py +++ b/modnet/featurizers/presets/__init__.py @@ -7,6 +7,7 @@ from typing import Dict, Type from .debreuck_2020 import DeBreuck2020Featurizer, CompositionOnlyFeaturizer from .matminer_2023 import Matminer2023Featurizer, CompositionOnlyMatminer2023Featurizer +from .matminer_all_2023 import MatminerAll2023Featurizer, CompositionOnlyMatminerAll2023Featurizer from modnet.featurizers import MODFeaturizer DEFAULT_FEATURIZER: str = "Matminer2023" @@ -16,5 +17,7 @@ "DeBreuck2020": DeBreuck2020Featurizer, "CompositionOnly": CompositionOnlyFeaturizer, "Matminer2023": Matminer2023Featurizer, + "MatminerAll2023": MatminerAll2023Featurizer, "CompositionOnlyMatminer2023": CompositionOnlyMatminer2023Featurizer, + "CompositionOnlyMatminerAll2023": CompositionOnlyMatminerAll2023Featurizer, } diff --git a/modnet/featurizers/presets/matminer_2023.py b/modnet/featurizers/presets/matminer_2023.py index 14b0f3cf..86bcb765 100644 --- a/modnet/featurizers/presets/matminer_2023.py +++ b/modnet/featurizers/presets/matminer_2023.py @@ -228,4 +228,4 @@ def __init__(self): super().__init__() self.oxid_composition_featurizers = () self.structure_featurizers = () - self.site_featurizers = () + self.site_featurizers = () \ No newline at end of file diff --git a/modnet/featurizers/presets/matminer_all_2023.py b/modnet/featurizers/presets/matminer_all_2023.py new file mode 100644 index 00000000..1b97b98a --- /dev/null +++ b/modnet/featurizers/presets/matminer_all_2023.py @@ -0,0 +1,341 @@ +""" This submodule contains the `MatminerAll2023Featurizer` class. """ + +import numpy as np +import modnet.featurizers +import contextlib + + +class MatminerAll2023Featurizer(modnet.featurizers.MODFeaturizer): + """A "kitchen-sink" featurizer for features implemented in matminer + at time of creation (matminer v0.8.0 from late 2022/early 2023). 
+ + Follows the same philosophy and featurizer list as the `DeBreuck2020Featurizer` + but with many features changing their underlying matminer implementation, + definition and behaviour since the creation of the former featurizer. + + """ + + def __init__(self, fast_oxid: bool = False): + """Creates the featurizer and imports all featurizer functions. + + Parameters: + fast_oxid: Whether to use the accelerated oxidation state parameters within + pymatgen when constructing features that constrain oxidation states such + that all sites with the same species in a structure will have the same + oxidation state (recommended if featurizing any structure + with large unit cells). + + """ + + super().__init__() + self.fast_oxid = fast_oxid + self.load_featurizers() + + def load_featurizers(self): + with contextlib.redirect_stdout(None): + from pymatgen.analysis.local_env import VoronoiNN + from matminer.featurizers.composition import ( + AtomicOrbitals, + AtomicPackingEfficiency, + BandCenter, + CationProperty, + ElectronAffinity, + ElectronegativityDiff, + ElementFraction, + ElementProperty, + IonProperty, + Miedema, + OxidationStates, + Stoichiometry, + TMetalFraction, + ValenceOrbital, + WenAlloys, + ) + from matminer.featurizers.structure import ( + # BagofBonds, - This descriptor was not used in the paper preset + BondFractions, + ChemicalOrdering, + CoulombMatrix, + DensityFeatures, + EwaldEnergy, + GlobalSymmetryFeatures, + MaximumPackingEfficiency, + # PartialRadialDistributionFunction, + RadialDistributionFunction, + SineCoulombMatrix, + StructuralHeterogeneity, + XRDPowderPattern, + ) + + from matminer.featurizers.site import ( + AGNIFingerprints, + AverageBondAngle, + AverageBondLength, + BondOrientationalParameter, + ChemEnvSiteFingerprint, + CoordinationNumber, + CrystalNNFingerprint, + GaussianSymmFunc, + GeneralizedRadialDistributionFunction, + LocalPropertyDifference, + OPSiteFingerprint, + VoronoiFingerprint, + ) + + # Get additional ElementProperty 
featurizer, but + # get only the features that are not yet present with another featurizer. + # Also in the case of continuous features, use only the mean and avg_dev. + from matminer.utils.data import PymatgenData, DemlData + magpie_featurizer = ElementProperty.from_preset("magpie") + magpie_featurizer.stats = ["mean", "avg_dev"] + + pymatgen_features = [ + "block", + "mendeleev_no", + "electrical_resistivity", + "velocity_of_sound", + "thermal_conductivity", + "bulk_modulus", + "coefficient_of_linear_thermal_expansion", + ] + pymatgen_featurizer = ElementProperty( + data_source=PymatgenData(), + stats=["mean", "avg_dev"], + features=pymatgen_features, + ) + + deml_features = [ + "atom_radius", + "molar_vol", + "heat_fusion", + "boiling_point", + "heat_cap", + "first_ioniz", + "electric_pol", + "GGAU_Etot", + "mus_fere", + "FERE correction", + ] + deml_featurizer = ElementProperty( + data_source=DemlData(), + stats=["mean", "avg_dev"], + features=deml_features, + ) + + self.composition_continuous_featurizers = ( + BandCenter(), + ElementFraction(), + magpie_featurizer, + pymatgen_featurizer, + deml_featurizer, + Stoichiometry(p_list=[2, 3, 5, 7, 10]), + TMetalFraction(), + ValenceOrbital(props=["frac"]), + WenAlloys(), + ) + + # Get back the initial presets from Matminer, without the duplicate features from Magpie + pymatgen_featurizer_full = ElementProperty( + data_source=PymatgenData(), + stats=["minimum", "maximum", "range", "mean", "std_dev"], + features=pymatgen_features, + ) + + deml_featurizer_full = ElementProperty( + data_source=DemlData(), + stats=["minimum", "maximum", "range", "mean", "std_dev"], + features=deml_features, + ) + + self.composition_featurizers = ( + AtomicOrbitals(), + AtomicPackingEfficiency(), + BandCenter(), + ElementFraction(), + ElementProperty.from_preset("magpie"), + pymatgen_featurizer_full, + deml_featurizer_full, + Miedema(), + Stoichiometry(), + TMetalFraction(), + ValenceOrbital(), + WenAlloys(), + ) + + 
self.oxid_composition_continuous_featurizers = ( + IonProperty(fast=self.fast_oxid), + OxidationStates(stats=["mean"]), + ) + + self.oxid_composition_featurizers = ( + CationProperty.from_preset("deml"), + ElectronAffinity(), + ElectronegativityDiff(), + IonProperty(fast=self.fast_oxid), + OxidationStates(), + ) + + self.structure_featurizers = ( + DensityFeatures(), + GlobalSymmetryFeatures(), + RadialDistributionFunction(), + CoulombMatrix(), + # PartialRadialDistributionFunction(), + SineCoulombMatrix(), + EwaldEnergy(), + BondFractions(), + StructuralHeterogeneity(), + MaximumPackingEfficiency(), + ChemicalOrdering(), + XRDPowderPattern(), + # BagofBonds(), + ) + + # Patch for matminer: see https://github.com/hackingmaterials/matminer/issues/864 + self.structure_featurizers[0].desired_features = None + self.structure_featurizers[1].desired_features = None + + self.site_featurizers = ( + AGNIFingerprints(), + AverageBondAngle(VoronoiNN()), + AverageBondLength(VoronoiNN()), + BondOrientationalParameter(), + ChemEnvSiteFingerprint.from_preset("simple"), + CoordinationNumber(), + CrystalNNFingerprint.from_preset("ops"), + GaussianSymmFunc(), + GeneralizedRadialDistributionFunction.from_preset("gaussian"), + LocalPropertyDifference(), + OPSiteFingerprint(), + VoronoiFingerprint(), + ) + + def featurize_composition(self, df): + """Applies the preset composition featurizers to the input dataframe, + renames some fields and cleans the output dataframe. 
+ + """ + from pymatgen.core.periodic_table import Element + + df = super().featurize_composition(df) + + if self.composition_featurizers: + _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} + df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map( + _orbitals + ) + df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map( + _orbitals + ) + + df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply( + lambda x: -1 if not isinstance(x, str) else Element(x).Z + ) + df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply( + lambda x: -1 if not isinstance(x, str) else Element(x).Z + ) + + if self.composition_continuous_featurizers: + df.drop( + columns=[ + "WenAlloys|Yang omega", + "WenAlloys|Yang delta", + "WenAlloys|Radii gamma", + "WenAlloys|Lambda entropy", + "WenAlloys|APE mean", + "WenAlloys|Interant electrons", + "WenAlloys|Interant s electrons", + "WenAlloys|Interant p electrons", + "WenAlloys|Interant d electrons", + "WenAlloys|Interant f electrons", + "WenAlloys|Atomic weight mean", + "WenAlloys|Total weight", + ], + inplace=True + ) + + if self.oxid_composition_continuous_featurizers: + df.drop(columns=["IonProperty|max ionic char"], inplace=True) + + return modnet.featurizers.clean_df(df) + + def featurize_structure(self, df): + """Applies the preset structural featurizers to the input dataframe, + renames some fields and cleans the output dataframe. 
+ + """ + + if self.structure_featurizers: + df = super().featurize_structure(df) + + _crystal_system = { + "cubic": 1, + "tetragonal": 2, + "orthorombic": 3, + "hexagonal": 4, + "trigonal": 5, + "monoclinic": 6, + "triclinic": 7, + } + + def _int_map(x): + if x == np.nan: + return 0 + elif x: + return 1 + else: + return 0 + + df["GlobalSymmetryFeatures|crystal_system"] = df[ + "GlobalSymmetryFeatures|crystal_system" + ].map(_crystal_system) + df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[ + "GlobalSymmetryFeatures|is_centrosymmetric" + ].map(_int_map) + + return modnet.featurizers.clean_df(df) + + def featurize_site(self, df): + """Applies the preset site featurizers to the input dataframe, + renames some fields and cleans the output dataframe. + + """ + + # rename some features for backwards compatibility with pretrained models + aliases = { + "GeneralizedRadialDistributionFunction": "GeneralizedRDF", + "AGNIFingerprints": "AGNIFingerPrint", + "BondOrientationalParameter": "BondOrientationParameter", + } + df = super().featurize_site(df, aliases=aliases) + df = df.loc[:, (df != 0).any(axis=0)] + + return modnet.featurizers.clean_df(df) + + +class CompositionOnlyMatminerAll2023Featurizer(MatminerAll2023Featurizer): + """This subclass simply disables structure and site-level features + from the main `MatminerAll2023Featurizer` class. + + This should yield identical results to the original 2020 version. 
+ + """ + + def __init__(self, continuous_only: bool = False, oxidation_featurizers: bool = False, fast_oxid: bool = False): + super().__init__(fast_oxid=fast_oxid) + self.fast_oxid = fast_oxid + self.structure_featurizers = () + self.site_featurizers = () + if continuous_only: + self.composition_featurizers = () + else: + self.composition_continuous_featurizers = () + + if oxidation_featurizers: + if continuous_only: + self.oxid_composition_featurizers = () + else: + self.oxid_composition_continuous_featurizers = () + else: + self.oxid_composition_featurizers = () + self.oxid_composition_continuous_featurizers = () diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index bdf3bb88..576dc427 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -117,9 +117,7 @@ def _mapArrayToInt(a): # Drop features which have the same value for the entire data set if drop_constant_features: - frange = df_feat.max(axis=0) - df_feat.min(axis=0) - to_drop = frange[frange == 0].index - df_feat = df_feat.drop(to_drop, axis=1) + df_feat = df_feat.loc[:, (df_feat != df_feat.iloc[0]).any()] # preprocess the input matrix if ( From f3c7488bb1aeaff079842300e6237e0bc5243b1d Mon Sep 17 00:00:00 2001 From: gbrunin Date: Thu, 23 Feb 2023 17:04:59 +0100 Subject: [PATCH 03/10] pre-commit --- modnet/featurizers/featurizers.py | 43 +++++++++++++------ modnet/featurizers/presets/matminer_2023.py | 2 +- .../featurizers/presets/matminer_all_2023.py | 22 ++++++---- 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/modnet/featurizers/featurizers.py b/modnet/featurizers/featurizers.py index ad8636c8..b52dbc57 100644 --- a/modnet/featurizers/featurizers.py +++ b/modnet/featurizers/featurizers.py @@ -84,7 +84,11 @@ def featurize(self, df: pd.DataFrame) -> pd.DataFrame: """ df_composition = pd.DataFrame([]) - if self.composition_featurizers or self.oxid_composition_featurizers or self.composition_continuous_featurizers: + if ( + self.composition_featurizers + or 
self.oxid_composition_featurizers + or self.composition_continuous_featurizers + ): df_composition = self.featurize_composition(df) df_structure = pd.DataFrame([]) @@ -205,17 +209,29 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame: df = df.rename(columns={"Input Data": ""}) df.columns = df.columns.map("|".join).str.strip("|") - if self.oxid_composition_featurizers or self.oxid_composition_continuous_featurizers: + if ( + self.oxid_composition_featurizers + or self.oxid_composition_continuous_featurizers + ): LOG.info("Applying oxidation state featurizers...") # Get integer composition if some are not col_comp = "composition" - if not all(all(amt == int(amt) for amt in comp.values()) for comp in df["composition"].values): - LOG.info("There are non-integer compositions in the dataset, and featurizers that need them. " - "Computing...") + if not all( + all(amt == int(amt) for amt in comp.values()) + for comp in df["composition"].values + ): + LOG.info( + "There are non-integer compositions in the dataset, and featurizers that need them. " + "Computing..." 
+ ) df["integer_composition"] = [ - Composition(comp.get_integer_formula_and_factor( - max_denominator=5 if getattr(self, "fast_oxid", False) else 100)[0] - ) + Composition( + comp.get_integer_formula_and_factor( + max_denominator=5 + if getattr(self, "fast_oxid", False) + else 100 + )[0] + ) for comp in df["composition"].values ] # df["integer_composition"] = df["composition"].apply( @@ -231,13 +247,14 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame: ).featurize_dataframe(df, col_id=col_comp) else: df = CompositionToOxidComposition( - max_sites=-1 if self.oxid_composition_continuous_featurizers else None - ).featurize_dataframe( - df, col_id=col_comp, ignore_errors=True - ) + max_sites=-1 + if self.oxid_composition_continuous_featurizers + else None + ).featurize_dataframe(df, col_id=col_comp, ignore_errors=True) df = self._fit_apply_featurizers( df, - self.oxid_composition_featurizers or self.oxid_composition_continuous_featurizers, + self.oxid_composition_featurizers + or self.oxid_composition_continuous_featurizers, "composition_oxid", mode=self.featurizer_mode, ) diff --git a/modnet/featurizers/presets/matminer_2023.py b/modnet/featurizers/presets/matminer_2023.py index 86bcb765..14b0f3cf 100644 --- a/modnet/featurizers/presets/matminer_2023.py +++ b/modnet/featurizers/presets/matminer_2023.py @@ -228,4 +228,4 @@ def __init__(self): super().__init__() self.oxid_composition_featurizers = () self.structure_featurizers = () - self.site_featurizers = () \ No newline at end of file + self.site_featurizers = () diff --git a/modnet/featurizers/presets/matminer_all_2023.py b/modnet/featurizers/presets/matminer_all_2023.py index 1b97b98a..8159d6eb 100644 --- a/modnet/featurizers/presets/matminer_all_2023.py +++ b/modnet/featurizers/presets/matminer_all_2023.py @@ -86,6 +86,7 @@ def load_featurizers(self): # get only the features that are not yet present with another featurizer. 
# Also in the case of continuous features, use only the mean and avg_dev. from matminer.utils.data import PymatgenData, DemlData + magpie_featurizer = ElementProperty.from_preset("magpie") magpie_featurizer.stats = ["mean", "avg_dev"] @@ -221,12 +222,12 @@ def featurize_composition(self, df): if self.composition_featurizers: _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} - df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map( - _orbitals - ) - df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map( - _orbitals - ) + df["AtomicOrbitals|HOMO_character"] = df[ + "AtomicOrbitals|HOMO_character" + ].map(_orbitals) + df["AtomicOrbitals|LUMO_character"] = df[ + "AtomicOrbitals|LUMO_character" + ].map(_orbitals) df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply( lambda x: -1 if not isinstance(x, str) else Element(x).Z @@ -251,7 +252,7 @@ def featurize_composition(self, df): "WenAlloys|Atomic weight mean", "WenAlloys|Total weight", ], - inplace=True + inplace=True, ) if self.oxid_composition_continuous_featurizers: @@ -321,7 +322,12 @@ class CompositionOnlyMatminerAll2023Featurizer(MatminerAll2023Featurizer): """ - def __init__(self, continuous_only: bool = False, oxidation_featurizers: bool = False, fast_oxid: bool = False): + def __init__( + self, + continuous_only: bool = False, + oxidation_featurizers: bool = False, + fast_oxid: bool = False, + ): super().__init__(fast_oxid=fast_oxid) self.fast_oxid = fast_oxid self.structure_featurizers = () From a64d9e9d6128d6afe32580591a0f994f5fd058d1 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Fri, 24 Feb 2023 11:27:45 +0100 Subject: [PATCH 04/10] Added a continuous_only attribute to MatminerAll2023 --- modnet/featurizers/featurizers.py | 26 +-- .../featurizers/presets/matminer_all_2023.py | 183 ++++++++++-------- 2 files changed, 105 insertions(+), 104 deletions(-) diff --git a/modnet/featurizers/featurizers.py b/modnet/featurizers/featurizers.py index 
b52dbc57..2d9f4b93 100644 --- a/modnet/featurizers/featurizers.py +++ b/modnet/featurizers/featurizers.py @@ -41,9 +41,7 @@ class MODFeaturizer(abc.ABC): """ - composition_continuous_featurizers: Optional[Iterable[BaseFeaturizer]] = None composition_featurizers: Optional[Iterable[BaseFeaturizer]] = None - oxid_composition_continuous_featurizers: Optional[Iterable[BaseFeaturizer]] = None oxid_composition_featurizers: Optional[Iterable[BaseFeaturizer]] = None structure_featurizers: Optional[Iterable[BaseFeaturizer]] = None site_featurizers: Optional[Iterable[BaseFeaturizer]] = None @@ -84,11 +82,7 @@ def featurize(self, df: pd.DataFrame) -> pd.DataFrame: """ df_composition = pd.DataFrame([]) - if ( - self.composition_featurizers - or self.oxid_composition_featurizers - or self.composition_continuous_featurizers - ): + if self.composition_featurizers or self.oxid_composition_featurizers: df_composition = self.featurize_composition(df) df_structure = pd.DataFrame([]) @@ -196,23 +190,20 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame: df = df.copy() - if self.composition_featurizers or self.composition_continuous_featurizers: + if self.composition_featurizers: LOG.info("Applying composition featurizers...") df["composition"] = df["structure"].apply(lambda s: s.composition) df = self._fit_apply_featurizers( df, - self.composition_featurizers or self.composition_continuous_featurizers, + self.composition_featurizers, "composition", mode=self.featurizer_mode, ) df = df.rename(columns={"Input Data": ""}) df.columns = df.columns.map("|".join).str.strip("|") - if ( - self.oxid_composition_featurizers - or self.oxid_composition_continuous_featurizers - ): + if self.oxid_composition_featurizers: LOG.info("Applying oxidation state featurizers...") # Get integer composition if some are not col_comp = "composition" @@ -227,7 +218,7 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame: df["integer_composition"] = [ Composition( 
comp.get_integer_formula_and_factor( - max_denominator=5 + max_denominator=10 if getattr(self, "fast_oxid", False) else 100 )[0] @@ -247,14 +238,11 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame: ).featurize_dataframe(df, col_id=col_comp) else: df = CompositionToOxidComposition( - max_sites=-1 - if self.oxid_composition_continuous_featurizers - else None + max_sites=-1 if getattr(self, "continuous_only", False) else None ).featurize_dataframe(df, col_id=col_comp, ignore_errors=True) df = self._fit_apply_featurizers( df, - self.oxid_composition_featurizers - or self.oxid_composition_continuous_featurizers, + self.oxid_composition_featurizers, "composition_oxid", mode=self.featurizer_mode, ) diff --git a/modnet/featurizers/presets/matminer_all_2023.py b/modnet/featurizers/presets/matminer_all_2023.py index 8159d6eb..b5d5d793 100644 --- a/modnet/featurizers/presets/matminer_all_2023.py +++ b/modnet/featurizers/presets/matminer_all_2023.py @@ -9,13 +9,14 @@ class MatminerAll2023Featurizer(modnet.featurizers.MODFeaturizer): """A "kitchen-sink" featurizer for features implemented in matminer at time of creation (matminer v0.8.0 from late 2022/early 2023). - Follows the same philosophy and featurizer list as the `DeBreuck2020Featurizer` + Follows the same philosophy as the `DeBreuck2020Featurizer` but with many features changing their underlying matminer implementation, definition and behaviour since the creation of the former featurizer. + The featurizer list has also been updated to include all the available featurizers. """ - def __init__(self, fast_oxid: bool = False): + def __init__(self, fast_oxid: bool = False, continuous_only: bool = False): """Creates the featurizer and imports all featurizer functions. Parameters: @@ -24,11 +25,15 @@ def __init__(self, fast_oxid: bool = False): that all sites with the same species in a structure will have the same oxidation state (recommended if featurizing any structure with large unit cells). 
+ continuous_only: Whether to keep only the features that are continuous + with respect to the composition (only for composition featurizers). + Discontinuous features may lead to discontinuities in the model predictions. """ super().__init__() self.fast_oxid = fast_oxid + self.continuous_only = continuous_only self.load_featurizers() def load_featurizers(self): @@ -52,44 +57,56 @@ def load_featurizers(self): WenAlloys, ) from matminer.featurizers.structure import ( - # BagofBonds, - This descriptor was not used in the paper preset + # BagofBonds, BondFractions, ChemicalOrdering, CoulombMatrix, DensityFeatures, + # Dimensionality, + # ElectronicRadialDistributionFunction, EwaldEnergy, + # GlobalInstabilityIndex, # Still experimental? GlobalSymmetryFeatures, + # JarvisCFID, MaximumPackingEfficiency, + # MinimumRelativeDistances, + # OrbitalFieldMatrix, # PartialRadialDistributionFunction, RadialDistributionFunction, SineCoulombMatrix, + # SiteStatsFingerprint, + # StructuralComplexity, StructuralHeterogeneity, XRDPowderPattern, ) from matminer.featurizers.site import ( AGNIFingerprints, + # AngularFourierSeries, AverageBondAngle, AverageBondLength, BondOrientationalParameter, ChemEnvSiteFingerprint, + # ChemicalSRO, CoordinationNumber, CrystalNNFingerprint, + # EwaldSiteEnergy, GaussianSymmFunc, GeneralizedRadialDistributionFunction, + # IntersticeDistribution, LocalPropertyDifference, OPSiteFingerprint, + # SiteElementalProperty, + # SOAP, VoronoiFingerprint, ) # Get additional ElementProperty featurizer, but # get only the features that are not yet present with another featurizer. - # Also in the case of continuous features, use only the mean and avg_dev. + # For this reason, we cannot rely on the Matminer presets for those. + # Also in the case of continuous features, use only the mean and avg_dev for the statistics. 
from matminer.utils.data import PymatgenData, DemlData - magpie_featurizer = ElementProperty.from_preset("magpie") - magpie_featurizer.stats = ["mean", "avg_dev"] - pymatgen_features = [ "block", "mendeleev_no", @@ -99,11 +116,6 @@ def load_featurizers(self): "bulk_modulus", "coefficient_of_linear_thermal_expansion", ] - pymatgen_featurizer = ElementProperty( - data_source=PymatgenData(), - stats=["mean", "avg_dev"], - features=pymatgen_features, - ) deml_features = [ "atom_radius", @@ -117,64 +129,76 @@ def load_featurizers(self): "mus_fere", "FERE correction", ] - deml_featurizer = ElementProperty( - data_source=DemlData(), - stats=["mean", "avg_dev"], - features=deml_features, - ) - - self.composition_continuous_featurizers = ( - BandCenter(), - ElementFraction(), - magpie_featurizer, - pymatgen_featurizer, - deml_featurizer, - Stoichiometry(p_list=[2, 3, 5, 7, 10]), - TMetalFraction(), - ValenceOrbital(props=["frac"]), - WenAlloys(), - ) - - # Get back the initial presets from Matminer, without the duplicate features from Magpie - pymatgen_featurizer_full = ElementProperty( - data_source=PymatgenData(), - stats=["minimum", "maximum", "range", "mean", "std_dev"], - features=pymatgen_features, - ) - deml_featurizer_full = ElementProperty( - data_source=DemlData(), - stats=["minimum", "maximum", "range", "mean", "std_dev"], - features=deml_features, - ) + if self.continuous_only: + magpie_featurizer = ElementProperty.from_preset("magpie") + magpie_featurizer.stats = ["mean", "avg_dev"] + + pymatgen_featurizer = ElementProperty( + data_source=PymatgenData(), + stats=["mean", "avg_dev"], + features=pymatgen_features, + ) + + deml_featurizer = ElementProperty( + data_source=DemlData(), + stats=["mean", "avg_dev"], + features=deml_features, + ) + + self.composition_featurizers = ( + BandCenter(), + ElementFraction(), + magpie_featurizer, + pymatgen_featurizer, + deml_featurizer, + Stoichiometry(p_list=[2, 3, 5, 7, 10]), + TMetalFraction(), + 
ValenceOrbital(props=["frac"]), + WenAlloys(), + ) + + self.oxid_composition_featurizers = ( + IonProperty(fast=self.fast_oxid), + OxidationStates(stats=["mean"]), + ) - self.composition_featurizers = ( - AtomicOrbitals(), - AtomicPackingEfficiency(), - BandCenter(), - ElementFraction(), - ElementProperty.from_preset("magpie"), - pymatgen_featurizer_full, - deml_featurizer_full, - Miedema(), - Stoichiometry(), - TMetalFraction(), - ValenceOrbital(), - WenAlloys(), - ) - - self.oxid_composition_continuous_featurizers = ( - IonProperty(fast=self.fast_oxid), - OxidationStates(stats=["mean"]), - ) - - self.oxid_composition_featurizers = ( - CationProperty.from_preset("deml"), - ElectronAffinity(), - ElectronegativityDiff(), - IonProperty(fast=self.fast_oxid), - OxidationStates(), - ) + else: + # Get the initial presets from Matminer, without the duplicate features from Magpie + pymatgen_featurizer_full = ElementProperty( + data_source=PymatgenData(), + stats=["minimum", "maximum", "range", "mean", "std_dev"], + features=pymatgen_features, + ) + + deml_featurizer_full = ElementProperty( + data_source=DemlData(), + stats=["minimum", "maximum", "range", "mean", "std_dev"], + features=deml_features, + ) + + self.composition_featurizers = ( + AtomicOrbitals(), + AtomicPackingEfficiency(), + BandCenter(), + ElementFraction(), + ElementProperty.from_preset("magpie"), + pymatgen_featurizer_full, + deml_featurizer_full, + Miedema(), + Stoichiometry(), + TMetalFraction(), + ValenceOrbital(), + WenAlloys(), + ) + + self.oxid_composition_featurizers = ( + CationProperty.from_preset("deml"), + ElectronAffinity(), + ElectronegativityDiff(), + IonProperty(fast=self.fast_oxid), + OxidationStates(), + ) self.structure_featurizers = ( DensityFeatures(), @@ -220,7 +244,7 @@ def featurize_composition(self, df): df = super().featurize_composition(df) - if self.composition_featurizers: + if self.composition_featurizers and not self.continuous_only: _orbitals = {"s": 1, "p": 2, "d": 3, "f": 
4} df["AtomicOrbitals|HOMO_character"] = df[ "AtomicOrbitals|HOMO_character" @@ -236,7 +260,7 @@ def featurize_composition(self, df): lambda x: -1 if not isinstance(x, str) else Element(x).Z ) - if self.composition_continuous_featurizers: + if self.continuous_only: df.drop( columns=[ "WenAlloys|Yang omega", @@ -255,8 +279,8 @@ def featurize_composition(self, df): inplace=True, ) - if self.oxid_composition_continuous_featurizers: - df.drop(columns=["IonProperty|max ionic char"], inplace=True) + if self.oxid_composition_featurizers: + df.drop(columns=["IonProperty|max ionic char"], inplace=True) return modnet.featurizers.clean_df(df) @@ -328,20 +352,9 @@ def __init__( oxidation_featurizers: bool = False, fast_oxid: bool = False, ): - super().__init__(fast_oxid=fast_oxid) + super().__init__(fast_oxid=fast_oxid, continuous_only=continuous_only) self.fast_oxid = fast_oxid self.structure_featurizers = () self.site_featurizers = () - if continuous_only: - self.composition_featurizers = () - else: - self.composition_continuous_featurizers = () - - if oxidation_featurizers: - if continuous_only: - self.oxid_composition_featurizers = () - else: - self.oxid_composition_continuous_featurizers = () - else: + if not oxidation_featurizers: self.oxid_composition_featurizers = () - self.oxid_composition_continuous_featurizers = () From a7a53aac1097decdfa863ab2e47c648f657f9b1a Mon Sep 17 00:00:00 2001 From: gbrunin Date: Fri, 24 Feb 2023 14:43:42 +0100 Subject: [PATCH 05/10] Added many features from ElementProperty. Removed a few more discontinuous features. 
--- .../featurizers/presets/matminer_all_2023.py | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/modnet/featurizers/presets/matminer_all_2023.py b/modnet/featurizers/presets/matminer_all_2023.py index b5d5d793..7a3bc845 100644 --- a/modnet/featurizers/presets/matminer_all_2023.py +++ b/modnet/featurizers/presets/matminer_all_2023.py @@ -105,7 +105,12 @@ def load_featurizers(self): # get only the features that are not yet present with another featurizer. # For this reason, we cannot rely on the Matminer presets for those. # Also in the case of continuous features, use only the mean and avg_dev for the statistics. - from matminer.utils.data import PymatgenData, DemlData + from matminer.utils.data import ( + PymatgenData, + DemlData, + MatscholarElementData, + MEGNetElementData, + ) pymatgen_features = [ "block", @@ -146,12 +151,26 @@ def load_featurizers(self): features=deml_features, ) + matscholar_featurizer = ElementProperty( + data_source=MatscholarElementData(), + stats=["mean", "avg_dev"], + features=MatscholarElementData().prop_names, + ) + + megnet_featurizer = ElementProperty( + data_source=MEGNetElementData(), + stats=["mean", "avg_dev"], + features=MEGNetElementData().prop_names, + ) + self.composition_featurizers = ( BandCenter(), ElementFraction(), magpie_featurizer, pymatgen_featurizer, deml_featurizer, + matscholar_featurizer, + megnet_featurizer, Stoichiometry(p_list=[2, 3, 5, 7, 10]), TMetalFraction(), ValenceOrbital(props=["frac"]), @@ -185,10 +204,12 @@ def load_featurizers(self): ElementProperty.from_preset("magpie"), pymatgen_featurizer_full, deml_featurizer_full, + ElementProperty.from_preset("matscholar_el"), + ElementProperty.from_preset("megnet_el"), Miedema(), Stoichiometry(), TMetalFraction(), - ValenceOrbital(), + ValenceOrbital(props=["frac"]), WenAlloys(), ) @@ -261,6 +282,8 @@ def featurize_composition(self, df): ) if self.continuous_only: + # These are additional features that have shown 
discontinuities in my tests. + # Hopefully, I got them all... df.drop( columns=[ "WenAlloys|Yang omega", @@ -275,6 +298,11 @@ def featurize_composition(self, df): "WenAlloys|Interant f electrons", "WenAlloys|Atomic weight mean", "WenAlloys|Total weight", + "ElementProperty|DemlData mean electric_pol", + "ElementProperty|DemlData mean FERE correction", + "ElementProperty|DemlData mean GGAU_Etot", + "ElementProperty|DemlData mean heat_fusion", + "ElementProperty|DemlData mean mus_fere", ], inplace=True, ) From 6a65c63314a958ab57233db952f51b635890e4b1 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Mon, 27 Feb 2023 14:51:44 +0100 Subject: [PATCH 06/10] Added structure featurizers. --- modnet/featurizers/featurizers.py | 4 +- .../featurizers/presets/matminer_all_2023.py | 63 +++++++++++-------- modnet/preprocessing.py | 7 +++ 3 files changed, 48 insertions(+), 26 deletions(-) diff --git a/modnet/featurizers/featurizers.py b/modnet/featurizers/featurizers.py index 2d9f4b93..4422c570 100644 --- a/modnet/featurizers/featurizers.py +++ b/modnet/featurizers/featurizers.py @@ -299,6 +299,9 @@ def featurize_site( df.columns = ["Input data|" + x for x in df.columns] for fingerprint in self.site_featurizers: + fingerprint_name = fingerprint.__class__.__name__ + if fingerprint_name == "SOAP": + fingerprint.fit(df["Input data|structure"]) site_stats_fingerprint = SiteStatsFingerprint( fingerprint, stats=self.site_stats ) @@ -306,7 +309,6 @@ def featurize_site( df, "Input data|structure", multiindex=False, ignore_errors=True ) - fingerprint_name = fingerprint.__class__.__name__ if aliases: fingerprint_name = aliases.get(fingerprint_name, fingerprint_name) if "|" not in fingerprint_name: diff --git a/modnet/featurizers/presets/matminer_all_2023.py b/modnet/featurizers/presets/matminer_all_2023.py index 7a3bc845..a3d3a590 100644 --- a/modnet/featurizers/presets/matminer_all_2023.py +++ b/modnet/featurizers/presets/matminer_all_2023.py @@ -49,55 +49,57 @@ def load_featurizers(self): 
ElementFraction, ElementProperty, IonProperty, + # Meredig, # Included in others Miedema, OxidationStates, Stoichiometry, TMetalFraction, ValenceOrbital, WenAlloys, + # YangSolidSolution, # Included in WenAlloys ) from matminer.featurizers.structure import ( - # BagofBonds, + # BagofBonds, # Leads to >24 000 features BondFractions, ChemicalOrdering, - CoulombMatrix, + # CoulombMatrix, # Redundant with SineCoulombMatrix, which is better for periodic systems DensityFeatures, - # Dimensionality, - # ElectronicRadialDistributionFunction, + Dimensionality, + ElectronicRadialDistributionFunction, EwaldEnergy, # GlobalInstabilityIndex, # Still experimental? GlobalSymmetryFeatures, - # JarvisCFID, + JarvisCFID, MaximumPackingEfficiency, - # MinimumRelativeDistances, - # OrbitalFieldMatrix, - # PartialRadialDistributionFunction, + MinimumRelativeDistances, + # OrbitalFieldMatrix, # Buggy + # PartialRadialDistributionFunction, # Leads to >198 000 features RadialDistributionFunction, SineCoulombMatrix, - # SiteStatsFingerprint, - # StructuralComplexity, + # SiteStatsFingerprint, # Done in featurizers.py + StructuralComplexity, StructuralHeterogeneity, XRDPowderPattern, ) from matminer.featurizers.site import ( AGNIFingerprints, - # AngularFourierSeries, + # AngularFourierSeries, # Redundant with GaussianSymmFunc AverageBondAngle, AverageBondLength, BondOrientationalParameter, ChemEnvSiteFingerprint, - # ChemicalSRO, + # ChemicalSRO, # Buggy CoordinationNumber, CrystalNNFingerprint, - # EwaldSiteEnergy, + EwaldSiteEnergy, GaussianSymmFunc, GeneralizedRadialDistributionFunction, - # IntersticeDistribution, + IntersticeDistribution, LocalPropertyDifference, OPSiteFingerprint, - # SiteElementalProperty, - # SOAP, + # SiteElementalProperty, # Already included in composition featurizers + # SOAP, # Leads to >260 000 features... 
VoronoiFingerprint, ) @@ -222,37 +224,48 @@ def load_featurizers(self): ) self.structure_featurizers = ( + # BagofBonds(), # > 24 000 features + BondFractions(), + ChemicalOrdering(), + # CoulombMatrix(), # Redundant with SineCoulombMatrix, which is better for periodic systems DensityFeatures(), + Dimensionality(), + ElectronicRadialDistributionFunction(), + EwaldEnergy(), GlobalSymmetryFeatures(), + JarvisCFID(), # 1557 features, many redundant ones + MaximumPackingEfficiency(), + MinimumRelativeDistances(), + # OrbitalFieldMatrix(), # Buggy + # PartialRadialDistributionFunction(), # > 198 000 features RadialDistributionFunction(), - CoulombMatrix(), - # PartialRadialDistributionFunction(), SineCoulombMatrix(), - EwaldEnergy(), - BondFractions(), + StructuralComplexity(), StructuralHeterogeneity(), - MaximumPackingEfficiency(), - ChemicalOrdering(), XRDPowderPattern(), - # BagofBonds(), ) # Patch for matminer: see https://github.com/hackingmaterials/matminer/issues/864 - self.structure_featurizers[0].desired_features = None - self.structure_featurizers[1].desired_features = None + self.structure_featurizers[2].desired_features = None + self.structure_featurizers[6].desired_features = None self.site_featurizers = ( AGNIFingerprints(), + # AngularFourierSeries.from_preset("gaussian"), # Redundant with GaussianSymmFunc AverageBondAngle(VoronoiNN()), AverageBondLength(VoronoiNN()), BondOrientationalParameter(), ChemEnvSiteFingerprint.from_preset("simple"), + # ChemicalSRO.from_preset("VoronoiNN"), # Buggy CoordinationNumber(), CrystalNNFingerprint.from_preset("ops"), + EwaldSiteEnergy(), GaussianSymmFunc(), GeneralizedRadialDistributionFunction.from_preset("gaussian"), + IntersticeDistribution(), LocalPropertyDifference(), OPSiteFingerprint(), + # SOAP.from_preset("formation_energy"), # Leads to >260 000 features... 
VoronoiFingerprint(), ) diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index 576dc427..c5a5ba31 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -70,6 +70,7 @@ def nmi_target( df_target: pd.DataFrame, task_type: str = "regression", drop_constant_features: bool = True, + drop_duplicate_features: bool = True, **kwargs, ) -> pd.DataFrame: """ @@ -85,6 +86,8 @@ def nmi_target( task_type (integer): 0 for regression, 1 for classification drop_constant_features (bool): If True, the features that are constant across the entire data set will be dropped. + drop_duplicate_features (bool): If True, the features that have exactly the same + values across the entire data set will be dropped. **kwargs: Keyword arguments to be passed down to the :py:func:`mutual_info_regression` function from scikit-learn. This can be useful e.g. for testing purposes. @@ -115,6 +118,10 @@ def _mapArrayToInt(a): "should contain the same number of data points." ) + # Drop features that are duplicates across the entire data set + if drop_duplicate_features: + df_feat = df_feat.T.drop_duplicates().T + # Drop features which have the same value for the entire data set if drop_constant_features: df_feat = df_feat.loc[:, (df_feat != df_feat.iloc[0]).any()] From d33ae9a1e80dcec041d44f907650cc2c3d460e70 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Mon, 27 Feb 2023 15:03:28 +0100 Subject: [PATCH 07/10] Linting --- modnet/featurizers/presets/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modnet/featurizers/presets/__init__.py b/modnet/featurizers/presets/__init__.py index f39fa070..1191aec0 100644 --- a/modnet/featurizers/presets/__init__.py +++ b/modnet/featurizers/presets/__init__.py @@ -7,7 +7,10 @@ from typing import Dict, Type from .debreuck_2020 import DeBreuck2020Featurizer, CompositionOnlyFeaturizer from .matminer_2023 import Matminer2023Featurizer, CompositionOnlyMatminer2023Featurizer -from .matminer_all_2023 import 
MatminerAll2023Featurizer, CompositionOnlyMatminerAll2023Featurizer +from .matminer_all_2023 import ( + MatminerAll2023Featurizer, + CompositionOnlyMatminerAll2023Featurizer, +) from modnet.featurizers import MODFeaturizer DEFAULT_FEATURIZER: str = "Matminer2023" From 24a995a7c63fca8956a884bbad63ac59c103915a Mon Sep 17 00:00:00 2001 From: gbrunin Date: Tue, 28 Feb 2023 17:00:55 +0100 Subject: [PATCH 08/10] Added possibility to use continuous features only in the initial preset --- modnet/featurizers/presets/matminer_2023.py | 49 ++++++++++++++------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/modnet/featurizers/presets/matminer_2023.py b/modnet/featurizers/presets/matminer_2023.py index 14b0f3cf..80f74042 100644 --- a/modnet/featurizers/presets/matminer_2023.py +++ b/modnet/featurizers/presets/matminer_2023.py @@ -15,7 +15,7 @@ class Matminer2023Featurizer(modnet.featurizers.MODFeaturizer): """ - def __init__(self, fast_oxid: bool = False): + def __init__(self, fast_oxid: bool = False, continuous_only: bool = False): """Creates the featurizer and imports all featurizer functions. 
Parameters: @@ -28,8 +28,9 @@ def __init__(self, fast_oxid: bool = False): """ super().__init__() - self.load_featurizers() + self.continuous_only = continuous_only self.fast_oxid = fast_oxid + self.load_featurizers() def load_featurizers(self): with contextlib.redirect_stdout(None): @@ -82,19 +83,33 @@ def load_featurizers(self): VoronoiFingerprint, ) - self.composition_featurizers = ( - AtomicOrbitals(), - AtomicPackingEfficiency(), - BandCenter(), - ElementFraction(), - ElementProperty.from_preset("magpie"), - IonProperty(), - Miedema(), - Stoichiometry(), - TMetalFraction(), - ValenceOrbital(), - YangSolidSolution(), - ) + if self.continuous_only: + magpie_featurizer = ElementProperty.from_preset("magpie") + magpie_featurizer.stats = ["mean", "avg_dev"] + + self.composition_featurizers = ( + BandCenter(), + ElementFraction(), + magpie_featurizer, + IonProperty(fast=self.fast_oxid), + Stoichiometry(p_list=[2, 3, 5, 7, 10]), + TMetalFraction(), + ValenceOrbital(props=["frac"]), + ) + else: + self.composition_featurizers = ( + AtomicOrbitals(), + AtomicPackingEfficiency(), + BandCenter(), + ElementFraction(), + ElementProperty.from_preset("magpie"), + IonProperty(), + Miedema(), + Stoichiometry(), + TMetalFraction(), + ValenceOrbital(), + YangSolidSolution(), + ) self.oxid_composition_featurizers = ( ElectronegativityDiff(), @@ -224,8 +239,8 @@ class CompositionOnlyMatminer2023Featurizer(Matminer2023Featurizer): """ - def __init__(self): - super().__init__() + def __init__(self, continuous_only: bool = False, fast_oxid: bool = False): + super().__init__(fast_oxid=fast_oxid, continuous_only=continuous_only) self.oxid_composition_featurizers = () self.structure_featurizers = () self.site_featurizers = () From 4f117bf0e4285b93d3b5173dc6e22b650e8fff73 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Mon, 6 Mar 2023 08:17:01 +0100 Subject: [PATCH 09/10] Bug fix when using the initial preset with only continuous features. 
--- modnet/featurizers/presets/matminer_2023.py | 32 ++++++++++++--------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/modnet/featurizers/presets/matminer_2023.py b/modnet/featurizers/presets/matminer_2023.py index 80f74042..0d67a2a6 100644 --- a/modnet/featurizers/presets/matminer_2023.py +++ b/modnet/featurizers/presets/matminer_2023.py @@ -160,20 +160,24 @@ def featurize_composition(self, df): df = super().featurize_composition(df) - _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} - df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map( - _orbitals - ) - df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map( - _orbitals - ) - - df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply( - lambda x: -1 if not isinstance(x, str) else Element(x).Z - ) - df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply( - lambda x: -1 if not isinstance(x, str) else Element(x).Z - ) + if not self.continuous_only: + _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} + df["AtomicOrbitals|HOMO_character"] = df[ + "AtomicOrbitals|HOMO_character" + ].map(_orbitals) + df["AtomicOrbitals|LUMO_character"] = df[ + "AtomicOrbitals|LUMO_character" + ].map(_orbitals) + + df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply( + lambda x: -1 if not isinstance(x, str) else Element(x).Z + ) + df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply( + lambda x: -1 if not isinstance(x, str) else Element(x).Z + ) + + else: + df.drop(columns=["IonProperty|max ionic char"], inplace=True) return modnet.featurizers.clean_df(df) From c19274b46a3aac4077b82a26b0c4f6366e30acb5 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Tue, 9 May 2023 17:11:14 +0200 Subject: [PATCH 10/10] Removed 2 featurizers that give redundant features that are difficult to make sense of. 
--- .../featurizers/presets/matminer_all_2023.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/modnet/featurizers/presets/matminer_all_2023.py b/modnet/featurizers/presets/matminer_all_2023.py index a3d3a590..2e2a4e4b 100644 --- a/modnet/featurizers/presets/matminer_all_2023.py +++ b/modnet/featurizers/presets/matminer_all_2023.py @@ -110,8 +110,8 @@ def load_featurizers(self): from matminer.utils.data import ( PymatgenData, DemlData, - MatscholarElementData, - MEGNetElementData, + # MatscholarElementData, + # MEGNetElementData, ) pymatgen_features = [ @@ -153,17 +153,17 @@ def load_featurizers(self): features=deml_features, ) - matscholar_featurizer = ElementProperty( - data_source=MatscholarElementData(), - stats=["mean", "avg_dev"], - features=MatscholarElementData().prop_names, - ) - - megnet_featurizer = ElementProperty( - data_source=MEGNetElementData(), - stats=["mean", "avg_dev"], - features=MEGNetElementData().prop_names, - ) + # matscholar_featurizer = ElementProperty( + # data_source=MatscholarElementData(), + # stats=["mean", "avg_dev"], + # features=MatscholarElementData().prop_names, + # ) + # + # megnet_featurizer = ElementProperty( + # data_source=MEGNetElementData(), + # stats=["mean", "avg_dev"], + # features=MEGNetElementData().prop_names, + # ) self.composition_featurizers = ( BandCenter(), @@ -171,8 +171,8 @@ def load_featurizers(self): magpie_featurizer, pymatgen_featurizer, deml_featurizer, - matscholar_featurizer, - megnet_featurizer, + # matscholar_featurizer, + # megnet_featurizer, Stoichiometry(p_list=[2, 3, 5, 7, 10]), TMetalFraction(), ValenceOrbital(props=["frac"]), @@ -206,8 +206,8 @@ def load_featurizers(self): ElementProperty.from_preset("magpie"), pymatgen_featurizer_full, deml_featurizer_full, - ElementProperty.from_preset("matscholar_el"), - ElementProperty.from_preset("megnet_el"), + # ElementProperty.from_preset("matscholar_el"), + # ElementProperty.from_preset("megnet_el"), Miedema(), 
Stoichiometry(), TMetalFraction(),