diff --git a/modnet/featurizers/featurizers.py b/modnet/featurizers/featurizers.py index ca8b20c7..4422c570 100644 --- a/modnet/featurizers/featurizers.py +++ b/modnet/featurizers/featurizers.py @@ -2,6 +2,7 @@ from typing import Optional, Iterable, Tuple, Dict import pandas as pd +from pymatgen.core import Composition from matminer.featurizers.base import MultipleFeaturizer, BaseFeaturizer from matminer.featurizers.structure import SiteStatsFingerprint @@ -204,14 +205,41 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame: if self.oxid_composition_featurizers: LOG.info("Applying oxidation state featurizers...") + # Get integer composition if some are not + col_comp = "composition" + if not all( + all(amt == int(amt) for amt in comp.values()) + for comp in df["composition"].values + ): + LOG.info( + "There are non-integer compositions in the dataset, and featurizers that need them. " + "Computing..." + ) + df["integer_composition"] = [ + Composition( + comp.get_integer_formula_and_factor( + max_denominator=10 + if getattr(self, "fast_oxid", False) + else 100 + )[0] + ) + for comp in df["composition"].values + ] + # df["integer_composition"] = df["composition"].apply( + # lambda c: c.get_integer_formula_and_factor( + # max_denominator=10 if getattr(self, "fast_oxid", False) else 100 + # )[0] + # ) + + col_comp = "integer_composition" if getattr(self, "fast_oxid", False): df = CompositionToOxidComposition( all_oxi_states=False, max_sites=-1 - ).featurize_dataframe(df, "composition") + ).featurize_dataframe(df, col_id=col_comp) else: - df = CompositionToOxidComposition().featurize_dataframe( - df, "composition" - ) + df = CompositionToOxidComposition( + max_sites=-1 if getattr(self, "continuous_only", False) else None + ).featurize_dataframe(df, col_id=col_comp, ignore_errors=True) df = self._fit_apply_featurizers( df, self.oxid_composition_featurizers, @@ -271,6 +299,9 @@ def featurize_site( df.columns = ["Input data|" + x for x in df.columns] for 
fingerprint in self.site_featurizers: + fingerprint_name = fingerprint.__class__.__name__ + if fingerprint_name == "SOAP": + fingerprint.fit(df["Input data|structure"]) site_stats_fingerprint = SiteStatsFingerprint( fingerprint, stats=self.site_stats ) @@ -278,7 +309,6 @@ def featurize_site( df, "Input data|structure", multiindex=False, ignore_errors=True ) - fingerprint_name = fingerprint.__class__.__name__ if aliases: fingerprint_name = aliases.get(fingerprint_name, fingerprint_name) if "|" not in fingerprint_name: diff --git a/modnet/featurizers/presets/__init__.py b/modnet/featurizers/presets/__init__.py index f1417fb7..1191aec0 100644 --- a/modnet/featurizers/presets/__init__.py +++ b/modnet/featurizers/presets/__init__.py @@ -7,6 +7,10 @@ from typing import Dict, Type from .debreuck_2020 import DeBreuck2020Featurizer, CompositionOnlyFeaturizer from .matminer_2023 import Matminer2023Featurizer, CompositionOnlyMatminer2023Featurizer +from .matminer_all_2023 import ( + MatminerAll2023Featurizer, + CompositionOnlyMatminerAll2023Featurizer, +) from modnet.featurizers import MODFeaturizer DEFAULT_FEATURIZER: str = "Matminer2023" @@ -16,5 +20,7 @@ "DeBreuck2020": DeBreuck2020Featurizer, "CompositionOnly": CompositionOnlyFeaturizer, "Matminer2023": Matminer2023Featurizer, + "MatminerAll2023": MatminerAll2023Featurizer, "CompositionOnlyMatminer2023": CompositionOnlyMatminer2023Featurizer, + "CompositionOnlyMatminerAll2023": CompositionOnlyMatminerAll2023Featurizer, } diff --git a/modnet/featurizers/presets/matminer_2023.py b/modnet/featurizers/presets/matminer_2023.py index 14b0f3cf..0d67a2a6 100644 --- a/modnet/featurizers/presets/matminer_2023.py +++ b/modnet/featurizers/presets/matminer_2023.py @@ -15,7 +15,7 @@ class Matminer2023Featurizer(modnet.featurizers.MODFeaturizer): """ - def __init__(self, fast_oxid: bool = False): + def __init__(self, fast_oxid: bool = False, continuous_only: bool = False): """Creates the featurizer and imports all featurizer functions. 
Parameters: @@ -28,8 +28,9 @@ def __init__(self, fast_oxid: bool = False): """ super().__init__() - self.load_featurizers() + self.continuous_only = continuous_only self.fast_oxid = fast_oxid + self.load_featurizers() def load_featurizers(self): with contextlib.redirect_stdout(None): @@ -82,19 +83,33 @@ def load_featurizers(self): VoronoiFingerprint, ) - self.composition_featurizers = ( - AtomicOrbitals(), - AtomicPackingEfficiency(), - BandCenter(), - ElementFraction(), - ElementProperty.from_preset("magpie"), - IonProperty(), - Miedema(), - Stoichiometry(), - TMetalFraction(), - ValenceOrbital(), - YangSolidSolution(), - ) + if self.continuous_only: + magpie_featurizer = ElementProperty.from_preset("magpie") + magpie_featurizer.stats = ["mean", "avg_dev"] + + self.composition_featurizers = ( + BandCenter(), + ElementFraction(), + magpie_featurizer, + IonProperty(fast=self.fast_oxid), + Stoichiometry(p_list=[2, 3, 5, 7, 10]), + TMetalFraction(), + ValenceOrbital(props=["frac"]), + ) + else: + self.composition_featurizers = ( + AtomicOrbitals(), + AtomicPackingEfficiency(), + BandCenter(), + ElementFraction(), + ElementProperty.from_preset("magpie"), + IonProperty(), + Miedema(), + Stoichiometry(), + TMetalFraction(), + ValenceOrbital(), + YangSolidSolution(), + ) self.oxid_composition_featurizers = ( ElectronegativityDiff(), @@ -145,20 +160,24 @@ def featurize_composition(self, df): df = super().featurize_composition(df) - _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} - df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map( - _orbitals - ) - df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map( - _orbitals - ) - - df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply( - lambda x: -1 if not isinstance(x, str) else Element(x).Z - ) - df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply( - lambda x: -1 if not isinstance(x, str) else Element(x).Z - ) + if not 
self.continuous_only: + _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} + df["AtomicOrbitals|HOMO_character"] = df[ + "AtomicOrbitals|HOMO_character" + ].map(_orbitals) + df["AtomicOrbitals|LUMO_character"] = df[ + "AtomicOrbitals|LUMO_character" + ].map(_orbitals) + + df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply( + lambda x: -1 if not isinstance(x, str) else Element(x).Z + ) + df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply( + lambda x: -1 if not isinstance(x, str) else Element(x).Z + ) + + else: + df.drop(columns=["IonProperty|max ionic char"], inplace=True) return modnet.featurizers.clean_df(df) @@ -224,8 +243,8 @@ class CompositionOnlyMatminer2023Featurizer(Matminer2023Featurizer): """ - def __init__(self): - super().__init__() + def __init__(self, continuous_only: bool = False, fast_oxid: bool = False): + super().__init__(fast_oxid=fast_oxid, continuous_only=continuous_only) self.oxid_composition_featurizers = () self.structure_featurizers = () self.site_featurizers = () diff --git a/modnet/featurizers/presets/matminer_all_2023.py b/modnet/featurizers/presets/matminer_all_2023.py new file mode 100644 index 00000000..2e2a4e4b --- /dev/null +++ b/modnet/featurizers/presets/matminer_all_2023.py @@ -0,0 +1,401 @@ +""" This submodule contains the `MatminerAll2023Featurizer` class. """ + +import numpy as np +import modnet.featurizers +import contextlib + + +class MatminerAll2023Featurizer(modnet.featurizers.MODFeaturizer): + """A "kitchen-sink" featurizer for features implemented in matminer + at time of creation (matminer v0.8.0 from late 2022/early 2023). + + Follows the same philosophy as the `DeBreuck2020Featurizer` + but with many features changing their underlying matminer implementation, + definition and behaviour since the creation of the former featurizer. + The featurizer list has also been updated to include all the available featurizers. 
+ + """ + + def __init__(self, fast_oxid: bool = False, continuous_only: bool = False): + """Creates the featurizer and imports all featurizer functions. + + Parameters: + fast_oxid: Whether to use the accelerated oxidation state parameters within + pymatgen when constructing features that constrain oxidation states such + that all sites with the same species in a structure will have the same + oxidation state (recommended if featurizing any structure + with large unit cells). + continuous_only: Whether to keep only the features that are continuous + with respect to the composition (only for composition featurizers). + Discontinuous features may lead to discontinuities in the model predictions. + + """ + + super().__init__() + self.fast_oxid = fast_oxid + self.continuous_only = continuous_only + self.load_featurizers() + + def load_featurizers(self): + with contextlib.redirect_stdout(None): + from pymatgen.analysis.local_env import VoronoiNN + from matminer.featurizers.composition import ( + AtomicOrbitals, + AtomicPackingEfficiency, + BandCenter, + CationProperty, + ElectronAffinity, + ElectronegativityDiff, + ElementFraction, + ElementProperty, + IonProperty, + # Meredig, # Included in others + Miedema, + OxidationStates, + Stoichiometry, + TMetalFraction, + ValenceOrbital, + WenAlloys, + # YangSolidSolution, # Included in WenAlloys + ) + from matminer.featurizers.structure import ( + # BagofBonds, # Leads to >24 000 features + BondFractions, + ChemicalOrdering, + # CoulombMatrix, # Redundant with SineCoulombMatrix, which is better for periodic systems + DensityFeatures, + Dimensionality, + ElectronicRadialDistributionFunction, + EwaldEnergy, + # GlobalInstabilityIndex, # Still experimental? 
+ GlobalSymmetryFeatures, + JarvisCFID, + MaximumPackingEfficiency, + MinimumRelativeDistances, + # OrbitalFieldMatrix, # Buggy + # PartialRadialDistributionFunction, # Leads to >198 000 features + RadialDistributionFunction, + SineCoulombMatrix, + # SiteStatsFingerprint, # Done in featurizers.py + StructuralComplexity, + StructuralHeterogeneity, + XRDPowderPattern, + ) + + from matminer.featurizers.site import ( + AGNIFingerprints, + # AngularFourierSeries, # Redundant with GaussianSymmFunc + AverageBondAngle, + AverageBondLength, + BondOrientationalParameter, + ChemEnvSiteFingerprint, + # ChemicalSRO, # Buggy + CoordinationNumber, + CrystalNNFingerprint, + EwaldSiteEnergy, + GaussianSymmFunc, + GeneralizedRadialDistributionFunction, + IntersticeDistribution, + LocalPropertyDifference, + OPSiteFingerprint, + # SiteElementalProperty, # Already included in composition featurizers + # SOAP, # Leads to >260 000 features... + VoronoiFingerprint, + ) + + # Get additional ElementProperty featurizer, but + # get only the features that are not yet present with another featurizer. + # For this reason, we cannot rely on the Matminer presets for those. + # Also in the case of continuous features, use only the mean and avg_dev for the statistics. 
+ from matminer.utils.data import ( + PymatgenData, + DemlData, + # MatscholarElementData, + # MEGNetElementData, + ) + + pymatgen_features = [ + "block", + "mendeleev_no", + "electrical_resistivity", + "velocity_of_sound", + "thermal_conductivity", + "bulk_modulus", + "coefficient_of_linear_thermal_expansion", + ] + + deml_features = [ + "atom_radius", + "molar_vol", + "heat_fusion", + "boiling_point", + "heat_cap", + "first_ioniz", + "electric_pol", + "GGAU_Etot", + "mus_fere", + "FERE correction", + ] + + if self.continuous_only: + magpie_featurizer = ElementProperty.from_preset("magpie") + magpie_featurizer.stats = ["mean", "avg_dev"] + + pymatgen_featurizer = ElementProperty( + data_source=PymatgenData(), + stats=["mean", "avg_dev"], + features=pymatgen_features, + ) + + deml_featurizer = ElementProperty( + data_source=DemlData(), + stats=["mean", "avg_dev"], + features=deml_features, + ) + + # matscholar_featurizer = ElementProperty( + # data_source=MatscholarElementData(), + # stats=["mean", "avg_dev"], + # features=MatscholarElementData().prop_names, + # ) + # + # megnet_featurizer = ElementProperty( + # data_source=MEGNetElementData(), + # stats=["mean", "avg_dev"], + # features=MEGNetElementData().prop_names, + # ) + + self.composition_featurizers = ( + BandCenter(), + ElementFraction(), + magpie_featurizer, + pymatgen_featurizer, + deml_featurizer, + # matscholar_featurizer, + # megnet_featurizer, + Stoichiometry(p_list=[2, 3, 5, 7, 10]), + TMetalFraction(), + ValenceOrbital(props=["frac"]), + WenAlloys(), + ) + + self.oxid_composition_featurizers = ( + IonProperty(fast=self.fast_oxid), + OxidationStates(stats=["mean"]), + ) + + else: + # Get the initial presets from Matminer, without the duplicate features from Magpie + pymatgen_featurizer_full = ElementProperty( + data_source=PymatgenData(), + stats=["minimum", "maximum", "range", "mean", "std_dev"], + features=pymatgen_features, + ) + + deml_featurizer_full = ElementProperty( + data_source=DemlData(), 
+ stats=["minimum", "maximum", "range", "mean", "std_dev"], + features=deml_features, + ) + + self.composition_featurizers = ( + AtomicOrbitals(), + AtomicPackingEfficiency(), + BandCenter(), + ElementFraction(), + ElementProperty.from_preset("magpie"), + pymatgen_featurizer_full, + deml_featurizer_full, + # ElementProperty.from_preset("matscholar_el"), + # ElementProperty.from_preset("megnet_el"), + Miedema(), + Stoichiometry(), + TMetalFraction(), + ValenceOrbital(props=["frac"]), + WenAlloys(), + ) + + self.oxid_composition_featurizers = ( + CationProperty.from_preset("deml"), + ElectronAffinity(), + ElectronegativityDiff(), + IonProperty(fast=self.fast_oxid), + OxidationStates(), + ) + + self.structure_featurizers = ( + # BagofBonds(), # > 24 000 features + BondFractions(), + ChemicalOrdering(), + # CoulombMatrix(), # Redundant with SineCoulombMatrix, which is better for periodic systems + DensityFeatures(), + Dimensionality(), + ElectronicRadialDistributionFunction(), + EwaldEnergy(), + GlobalSymmetryFeatures(), + JarvisCFID(), # 1557 features, many redundant ones + MaximumPackingEfficiency(), + MinimumRelativeDistances(), + # OrbitalFieldMatrix(), # Buggy + # PartialRadialDistributionFunction(), # > 198 000 features + RadialDistributionFunction(), + SineCoulombMatrix(), + StructuralComplexity(), + StructuralHeterogeneity(), + XRDPowderPattern(), + ) + + # Patch for matminer: see https://github.com/hackingmaterials/matminer/issues/864 + self.structure_featurizers[2].desired_features = None + self.structure_featurizers[6].desired_features = None + + self.site_featurizers = ( + AGNIFingerprints(), + # AngularFourierSeries.from_preset("gaussian"), # Redundant with GaussianSymmFunc + AverageBondAngle(VoronoiNN()), + AverageBondLength(VoronoiNN()), + BondOrientationalParameter(), + ChemEnvSiteFingerprint.from_preset("simple"), + # ChemicalSRO.from_preset("VoronoiNN"), # Buggy + CoordinationNumber(), + CrystalNNFingerprint.from_preset("ops"), + EwaldSiteEnergy(), + 
GaussianSymmFunc(), + GeneralizedRadialDistributionFunction.from_preset("gaussian"), + IntersticeDistribution(), + LocalPropertyDifference(), + OPSiteFingerprint(), + # SOAP.from_preset("formation_energy"), # Leads to >260 000 features... + VoronoiFingerprint(), + ) + + def featurize_composition(self, df): + """Applies the preset composition featurizers to the input dataframe, + renames some fields and cleans the output dataframe. + + """ + from pymatgen.core.periodic_table import Element + + df = super().featurize_composition(df) + + if self.composition_featurizers and not self.continuous_only: + _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4} + df["AtomicOrbitals|HOMO_character"] = df[ + "AtomicOrbitals|HOMO_character" + ].map(_orbitals) + df["AtomicOrbitals|LUMO_character"] = df[ + "AtomicOrbitals|LUMO_character" + ].map(_orbitals) + + df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply( + lambda x: -1 if not isinstance(x, str) else Element(x).Z + ) + df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply( + lambda x: -1 if not isinstance(x, str) else Element(x).Z + ) + + if self.continuous_only: + # These are additional features that have shown discontinuities in my tests. + # Hopefully, I got them all... 
+ df.drop( + columns=[ + "WenAlloys|Yang omega", + "WenAlloys|Yang delta", + "WenAlloys|Radii gamma", + "WenAlloys|Lambda entropy", + "WenAlloys|APE mean", + "WenAlloys|Interant electrons", + "WenAlloys|Interant s electrons", + "WenAlloys|Interant p electrons", + "WenAlloys|Interant d electrons", + "WenAlloys|Interant f electrons", + "WenAlloys|Atomic weight mean", + "WenAlloys|Total weight", + "ElementProperty|DemlData mean electric_pol", + "ElementProperty|DemlData mean FERE correction", + "ElementProperty|DemlData mean GGAU_Etot", + "ElementProperty|DemlData mean heat_fusion", + "ElementProperty|DemlData mean mus_fere", + ], + inplace=True, + ) + + if self.oxid_composition_featurizers: + df.drop(columns=["IonProperty|max ionic char"], inplace=True) + + return modnet.featurizers.clean_df(df) + + def featurize_structure(self, df): + """Applies the preset structural featurizers to the input dataframe, + renames some fields and cleans the output dataframe. + + """ + + if self.structure_featurizers: + df = super().featurize_structure(df) + + _crystal_system = { + "cubic": 1, + "tetragonal": 2, + "orthorhombic": 3, # pymatgen spells it "orthorhombic"; the old "orthorombic" key never matched + "hexagonal": 4, + "trigonal": 5, + "monoclinic": 6, + "triclinic": 7, + } + + def _int_map(x): + if isinstance(x, float) and np.isnan(x): # "x == np.nan" is always False, so NaN used to map to 1 + return 0 + elif x: + return 1 + else: + return 0 + + df["GlobalSymmetryFeatures|crystal_system"] = df[ + "GlobalSymmetryFeatures|crystal_system" + ].map(_crystal_system) + df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[ + "GlobalSymmetryFeatures|is_centrosymmetric" + ].map(_int_map) + + return modnet.featurizers.clean_df(df) + + def featurize_site(self, df): + """Applies the preset site featurizers to the input dataframe, + renames some fields and cleans the output dataframe. 
+ + """ + + # rename some features for backwards compatibility with pretrained models + aliases = { + "GeneralizedRadialDistributionFunction": "GeneralizedRDF", + "AGNIFingerprints": "AGNIFingerPrint", + "BondOrientationalParameter": "BondOrientationParameter", + } + df = super().featurize_site(df, aliases=aliases) + df = df.loc[:, (df != 0).any(axis=0)] + + return modnet.featurizers.clean_df(df) + + +class CompositionOnlyMatminerAll2023Featurizer(MatminerAll2023Featurizer): + """This subclass simply disables structure and site-level features + from the main `MatminerAll2023Featurizer` class. + + Oxidation-state featurizers are also disabled unless `oxidation_featurizers=True`. + + """ + + def __init__( + self, + continuous_only: bool = False, + oxidation_featurizers: bool = False, + fast_oxid: bool = False, + ): + super().__init__(fast_oxid=fast_oxid, continuous_only=continuous_only) + self.fast_oxid = fast_oxid + self.structure_featurizers = () + self.site_featurizers = () + if not oxidation_featurizers: + self.oxid_composition_featurizers = () diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index 20e73d09..d8f5e2e7 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -70,6 +70,7 @@ def nmi_target( df_target: pd.DataFrame, task_type: str = "regression", drop_constant_features: bool = True, + drop_duplicate_features: bool = True, **kwargs, ) -> pd.DataFrame: """ @@ -85,6 +86,8 @@ def nmi_target( task_type (integer): 0 for regression, 1 for classification drop_constant_features (bool): If True, the features that are constant across the entire data set will be dropped. + drop_duplicate_features (bool): If True, the features that have exactly the same + values across the entire data set will be dropped. **kwargs: Keyword arguments to be passed down to the :py:func:`mutual_info_regression` function from scikit-learn. This can be useful e.g. for testing purposes. @@ -115,11 +118,13 @@ def _mapArrayToInt(a): "should contain the same number of data points." 
) + # Drop features that are duplicates across the entire data set + if drop_duplicate_features: + df_feat = df_feat.T.drop_duplicates().T + # Drop features which have the same value for the entire data set if drop_constant_features: - frange = df_feat.max(axis=0) - df_feat.min(axis=0) - to_drop = frange[frange == 0].index - df_feat = df_feat.drop(to_drop, axis=1) + df_feat = df_feat.loc[:, (df_feat != df_feat.iloc[0]).any()] # preprocess the input matrix if (