Skip to content

Problem combining dataframes from different classes of featurizers #252

@rogeriog

Description

@rogeriog

Regarding this code in featurizers.py:

def featurize(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run all of the preset featurizers on the input dataframe.

        Arguments:
            df: the input dataframe with a `"structure"` column
                containing pymatgen `Structure` objects.

        Returns:
            The featurized DataFrame.

        """
        df_composition = pd.DataFrame([])
        if self.composition_featurizers or self.oxid_composition_featurizers:
            df_composition = self.featurize_composition(df)

        df_structure = pd.DataFrame([])
        if self.structure_featurizers:
            df_structure = self.featurize_structure(df)

        df_site = pd.DataFrame([])
        if self.site_featurizers:
            df_site = self.featurize_site(df)

        return df_composition.join(df_structure.join(df_site, lsuffix="l"), rsuffix="r")

When df_composition/ df_structure will be empty an empty dataframe would be produced because of the join.
One can test with singleton featurizers:

from modnet.featurizers.presets import MatminerAll2023Featurizer
from matminer.featurizers.structure import GlobalSymmetryFeatures
featurizer_all = MatminerAll2023Featurizer()
# Get the lists of featurizers
all_featurizers = list(featurizer_all.structure_featurizers)
all_comp = list(featurizer_all.composition_featurizers)
all_oxid = list(featurizer_all.oxid_composition_featurizers)
all_site = list(featurizer_all.site_featurizers)
# We'll collect our new featurizers here:
featurizer_singletons = []

# Generate instances with one structure featurizer (others empty)
for s in all_featurizers:
    new_f = MatminerAll2023Featurizer()
    new_f.structure_featurizers       = [s, GlobalSymmetryFeatures()]
    new_f.composition_featurizers     = []
    new_f.oxid_composition_featurizers = []
    new_f.site_featurizers            = []
    new_f.name = f"structure_{s.__class__.__name__}"
    featurizer_singletons.append(new_f)

# Generate instances with one composition featurizer (others empty)
for c in [all_comp[-1]]:
    new_f = MatminerAll2023Featurizer()
    new_f.structure_featurizers        = []
    new_f.composition_featurizers      = [c]
    new_f.oxid_composition_featurizers = []
    new_f.site_featurizers             = []
    new_f.name = f"composition_{c.__class__.__name__}"
    featurizer_singletons.append(new_f)

# Generate instances with one oxid composition featurizer (others empty)
for o in all_oxid:
    new_f = MatminerAll2023Featurizer()
    new_f.structure_featurizers        = []
    new_f.composition_featurizers      = []
    new_f.oxid_composition_featurizers = [o]
    new_f.site_featurizers             = []
    new_f.name = f"oxid_composition_{o.__class__.__name__}"
    featurizer_singletons.append(new_f)

# Generate instances with one site featurizer (others empty)
for st in all_site:
    new_f = MatminerAll2023Featurizer()
    new_f.structure_featurizers        = []
    new_f.composition_featurizers      = []
    new_f.oxid_composition_featurizers = []
    new_f.site_featurizers             = [st]
    new_f.name = f"site_{st.__class__.__name__}"
    featurizer_singletons.append(new_f)

print(f"Created {len(featurizer_singletons)} individual featurizer instances.")
for f in featurizer_singletons:
    print(f"Featurizer name: {f.name}")

Additionally, structure_featurizers isn't flexible because it requires GlobalSymmetryFeatures() for the mapping that is done for these features later on.
I think it is in our interest to allow for this kind of modularity.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions