Skip to content

The pyMBE dataframe is monolithic #146

@pm-blanco

Description

@pm-blanco

Our current dataframe structure is highly heterogeneous, which complicates data management, reduces accessibility, and makes the implementation of new features more difficult. Since data in pyMBE is conceptually grouped by pmb_type, it seems natural to organize it through a hierarchical, type-based dataframe structure managed through a unified controller class.

Below is a minimal working draft of how such a system could be structured within the codebase.

Proposed API Design

High-level layout

pyMBE/
└── storage/
    ├── df_management.py         → _DFManagement
    ├── base_type.py       → PMBBaseModel 
    ├── types/
    │    ├── particle.py   → class Particle
    │    ├── particle_state.py   → class ParticleState
    │    ├── lj.py         → class LennardJones
    │    ├── bond.py       → ...
    └── io.py              → Save/load utilities

Base Object: PMBBaseModel

from typing import Any, Dict, Optional

from pydantic import BaseModel, ConfigDict, Field, field_validator

class PMBBaseModel(BaseModel):
    """Common base for every pyMBE storage model.

    Each record carries a ``name`` and a ``pmb_type``; subclasses add the
    fields specific to their type.
    """

    # Pydantic v2 configuration (replaces the deprecated nested ``Config``):
    # - validate_assignment: re-run validation when an attribute is assigned
    # - extra="forbid": reject fields that are not declared on the model
    model_config = ConfigDict(validate_assignment=True, extra="forbid")

    name: str
    pmb_type: str

Example Type Models

class ParticleState(PMBBaseModel):
    """One state of a particle, identified by its ``label``."""

    # Fixed type tag, declared the same way as on the sibling models.
    pmb_type: str = Field(default="particle_state", frozen=True)
    # Key under which the state is stored in ``Particle.states``.
    label: str
    # Declared as an integer because the usage example in this draft passes
    # integers (es_type=0, es_type=1); Pydantic v2 does not coerce int to str.
    es_type: int
    charge: float

class Particle(PMBBaseModel):
    """A particle together with its named states.

    ``sigma`` and ``epsilon`` are ``QuantityModel`` values (that type is
    defined elsewhere and is not shown in this draft). ``states`` maps a
    state label to its ``ParticleState``; ``default_state`` optionally names
    the label returned by ``get_state()`` when no label is given.
    """

    # Fixed type tag; frozen so it cannot be reassigned after construction.
    pmb_type: str = Field(default="particle", frozen=True)
    name: str
    sigma: QuantityModel
    epsilon: QuantityModel
    # Declared before ``default_state`` so it is already validated (and
    # available in ``info.data``) when the validator below runs.
    states: Dict[str, ParticleState] = Field(default_factory=dict)
    default_state: Optional[str] = None

    @field_validator("default_state")
    @classmethod
    def validate_default(cls, v, info):
        """Ensure ``default_state`` names an existing entry of ``states``.

        Pydantic v2 passes a ``ValidationInfo`` object as the third argument;
        the previously validated fields live in ``info.data`` (the v1-style
        ``values`` dict is no longer passed).

        Raises:
            ValueError: if ``v`` is set but is not a key of ``states``.
        """
        if v is None:
            return v
        if "states" not in info.data:
            # ``states`` itself failed validation; that error is reported
            # already, so do not pile a second, misleading one on top.
            return v
        known_states = info.data["states"]
        if v not in known_states:
            raise ValueError(f"default_state '{v}' not in states: {list(known_states)}")
        return v

    def add_state(self, state: ParticleState):
        """Register ``state`` under its label.

        Raises:
            ValueError: if a state with the same label is already stored.
        """
        if state.label in self.states:
            raise ValueError(f"Duplicate state label '{state.label}'")
        self.states[state.label] = state

    def get_state(self, label=None) -> ParticleState:
        """Return the state stored under ``label``.

        Falls back to ``default_state`` when ``label`` is falsy.

        Raises:
            KeyError: if no label is given and no default is set, or if the
                resolved label is not a key of ``states``.
        """
        label = label or self.default_state
        if label is None:
            raise KeyError("No state label given and no default_state is set")
        if label not in self.states:
            raise KeyError(f"State '{label}' not found")
        return self.states[label]

class LennardJones(PMBBaseModel):
    """Record holding the parameters of a Lennard-Jones potential."""

    # Frozen like ``Particle.pmb_type``: the manager keys its tables by this
    # default, so it should not be reassigned on an instance.
    pmb_type: str = Field(default="LennardJones", frozen=True)
    # Free-form parameter mapping; ``None`` when no parameters are set.
    parameters_of_the_potential: Optional[Dict[str, Any]] = None
    

_DFManagement API

import pandas as pd
from typing import Dict, Type, List, Callable

class _DFManagement:
    def __init__(self):
        self.tables: Dict[str, pd.DataFrame] = {}
        self.schemas: Dict[str, Type[PMBBaseModel]] = {}

    def register_type(self, model: Type[PMBBaseModel]):
        self.schemas[model.__fields__['pmb_type'].default] = model
        if model.__fields__['pmb_type'].default not in self.tables:
            self.tables[model.__fields__['pmb_type'].default] = pd.DataFrame()

    def set(self, pmb_type: str, **kwargs) -> PMBBaseModel:
        model = self.schemas[pmb_type](**kwargs)
        df = self.tables[pmb_type]
        self.tables[pmb_type] = pd.concat([df, pd.DataFrame([model.dict()])], ignore_index=True)
        return model

    def get(self, pmb_type: str) -> pd.DataFrame:
        return self.tables[pmb_type].copy()

    def query(self, pmb_type: str, fn: Callable[[pd.DataFrame], pd.DataFrame]):
        return fn(self.tables[pmb_type].copy())

    def update(self, pmb_type: str, index: int, **kwargs):
        model = self.schemas[pmb_type](**{**self.tables[pmb_type].iloc[index].to_dict(), **kwargs})
        self.tables[pmb_type].loc[index] = model.dict()

    def delete(self, pmb_type: str, index: int):
        self.tables[pmb_type] = self.tables[pmb_type].drop(index).reset_index(drop=True)

    def export(self,  folder="pmb_df"):
        # one file per type
        for p_type, df in self.tables.items():
            path = f"{folder}/{p_type}.csv"
            df.to_csv(path, index=False)

Example usage in pyMBE.py

from pyMBE.storage.df_management import _DFManagement as _DFm
from pyMBE.storage.types.particle import Particle
from pyMBE.storage.types.particle_state import ParticleState
from pyMBE.storage.types.lj import LennardJones


class pymbe_library():
    """Sketch of how the library class would own and use the dataframe manager."""

    def __init__(self):
        # Create and configure the database. The manager class is imported
        # under the alias ``_DFm``, so that is the name to instantiate.
        self._DFm = _DFm()

        # Register available pmb_types here
        self._DFm.register_type(Particle)
        self._DFm.register_type(LennardJones)

    def particle(self, name, sigma, epsilon):
        """Store a particle with a protonated and a deprotonated state.

        NOTE(review): the original draft left the signature as ``...``; the
        parameters here are the minimum the call below needs (``name`` is a
        required field of ``PMBBaseModel``). Adjust to the real API.
        """
        # ParticleState inherits the required ``name`` and ``pmb_type`` fields
        # from PMBBaseModel, so both are passed explicitly; the state label is
        # reused as its name -- TODO confirm the intended naming.
        states = {
            "protonated": ParticleState(name="protonated",
                                        pmb_type="particle_state",
                                        label="protonated",
                                        es_type=0,
                                        charge=0),
            "deprotonated": ParticleState(name="deprotonated",
                                          pmb_type="particle_state",
                                          label="deprotonated",
                                          es_type=1,
                                          charge=-1),
        }
        self._DFm.set("particle",
                      name=name,
                      sigma=sigma,
                      epsilon=epsilon,
                      states=states,
                      default_state="protonated")
  

Metadata

Metadata

Assignees

Projects

No projects

Relationships

None yet

Development

No branches or pull requests

Issue actions