diff --git a/src/openfe_analysis/reader.py b/src/openfe_analysis/reader.py index 83cca1c..cf7aaea 100644 --- a/src/openfe_analysis/reader.py +++ b/src/openfe_analysis/reader.py @@ -13,17 +13,18 @@ def _determine_iteration_dt(dataset) -> float: """ - Find out the timestep between each frame in the trajectory. + Determine the time increment between successive iterations + in a MultiStateReporter trajectory. Parameters ---------- dataset : nc.Dataset - Dataset holding the MultiStateReporter generated NetCDF file. + NetCDF dataset produced by ``openmmtools.multistate.MultiStateReporter``. Returns ------- float - The timestep in units of picoseconds. + The time between successive iterations, in picoseconds. Raises ------ @@ -35,7 +36,8 @@ def _determine_iteration_dt(dataset) -> float: ----- This assumes an MCMC move which serializes in a manner similar to `openmmtools.mcmc.LangevinDynamicsMove`, i.e. it must have - both a `timestep` and `n_steps` defined. + both a `timestep` and `n_steps` defined, such that + dt_iteration = n_steps * timestep """ # Deserialize the MCMC move information for the 0th entry. mcmc_move_data = yaml.load( @@ -149,16 +151,31 @@ def index_method(self) -> str: @staticmethod def parse_n_atoms(filename, **kwargs) -> int: + """ + Determine the number of atoms stored in a MultiStateReporter NetCDF file. + + Parameters + ---------- + filename : path-like + Path to the NetCDF file. + + Returns + ------- + int + Number of atoms in the system. + """ with nc.Dataset(filename) as ds: n_atoms = ds.dimensions["atom"].size return n_atoms def _read_next_timestep(self, ts=None) -> Timestep: + # Advance the trajectory by one frame. if (self._frame_index + 1) >= len(self): raise EOFError return self._read_frame(self._frame_index + 1) def _read_frame(self, frame: int) -> Timestep: + # Read a single trajectory frame. self._frame_index = frame frame = self._frames[self._frame_index] @@ -197,6 +214,7 @@ def _read_frame(self, frame: int) -> Timestep: @property def dt(self) -> float: + # Time difference between successive trajectory frames. return self._dt def _reopen(self): diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py index 2f5774a..1551880 100644 --- a/src/openfe_analysis/rmsd.py +++ b/src/openfe_analysis/rmsd.py @@ -16,22 +16,48 @@ def make_Universe(top: pathlib.Path, trj: nc.Dataset, state: int) -> mda.Universe: - """Makes a Universe and applies some transformations + """ + Construct an MDAnalysis Universe from a MultiState NetCDF trajectory + and apply standard analysis transformations. + + The Universe is created using the custom ``FEReader`` to extract a + single state from a multistate simulation. + + Parameters + ---------- + top : pathlib.Path or Topology + Path to a topology file (e.g. PDB) or an already-loaded MDAnalysis + topology object. + trj : nc.Dataset + Open NetCDF dataset produced by + ``openmmtools.multistate.MultiStateReporter``. + state : int + Thermodynamic state index to extract from the multistate trajectory. + + Returns + ------- + MDAnalysis.Universe + A Universe with trajectory transformations applied. + Notes + ----- Identifies two AtomGroups: - protein, defined as having standard amino acid names, then filtered down to CA - ligand, defined as resname UNK - Then applies some transformations. + Depending on whether a protein is present, a sequence of trajectory + transformations is applied: If a protein is present: - prevents the protein from jumping between periodic images - - moves the ligand to the image closest to the protein - - aligns the entire system to minimise the protein RMSD + (class:`NoJump`) + - moves the ligand to the image closest to the protein (:class:`Minimiser`) + - aligns the entire system to minimise the protein RMSD (:class:`Aligner`) - If only a ligand: + If only a ligand is present: - prevents the ligand from jumping between periodic images + - Aligns the ligand to minimize its RMSD """ u = mda.Universe( top, @@ -76,23 +102,39 @@ def make_Universe(top: pathlib.Path, trj: nc.Dataset, state: int) -> mda.Univers def gather_rms_data( pdb_topology: pathlib.Path, dataset: pathlib.Path, skip: Optional[int] = None ) -> dict[str, list[float]]: - """Generate structural analysis of RBFE simulation + """ + Compute structural RMSD-based metrics for a multistate BFE simulation. Parameters ---------- pdb_topology : pathlib.Path - path to pdb topology + Path to the PDB file defining system topology. dataset : pathlib.Path - path to nc trajectory + Path to the NetCDF trajectory file produced by a multistate simulation. skip : int, optional - step at which to progress through the trajectory. by default, selects a - step that produces roughly 500 frames of analysis per replicate - - Produces, for each lambda state: - - 1D protein RMSD timeseries 'protein_RMSD' - - ligand RMSD timeseries - - ligand COM motion 'ligand_wander' - - 2D protein RMSD plot + Frame stride for analysis. If ``None``, a stride is chosen such that + approximately 500 frames are analyzed per state. + + Returns + ------- + dict[str, list] + Dictionary containing per-state analysis results with keys: + ``protein_RMSD``, ``ligand_RMSD``, ``ligand_wander``, + ``protein_2D_RMSD``, and ``time(ps)``. + + Notes + ----- + For each thermodynamic state (lambda), this function: + - Loads the trajectory using ``FEReader`` + - Applies standard PBC-handling and alignment transformations + - Computes protein and ligand structural metrics over time + + The following analyses are produced per state: + - 1D protein CA RMSD time series + - 1D ligand RMSD time series + - Ligand center-of-mass displacement from its initial position + (``ligand_wander``) + - Flattened 2D protein RMSD matrix (pairwise RMSD between frames) """ output = { "protein_RMSD": [], @@ -187,19 +229,25 @@ def gather_rms_data( def twoD_RMSD(positions, w: Optional[npt.NDArray]) -> list[float]: - """2 dimensions RMSD + """ + Compute a flattened 2D RMSD matrix from a trajectory. + + For all unique frame pairs ``(i, j)`` with ``i < j``, this function + computes the RMSD between atomic coordinates after optimal alignment. Parameters ---------- positions : np.ndarray - the protein positions for the entire trajectory + Atomic coordinates for all frames in the trajectory. w : np.ndarray, optional - weights array + Per-atom weights to use in the RMSD calculation. If ``None``, + all atoms are weighted equally. Returns ------- - rmsd_matrix : list - a flattened version of the 2d + list of float + Flattened list of RMSD values corresponding to all frame pairs + ``(i, j)`` with ``i < j``. """ nframes, _, _ = positions.shape diff --git a/src/openfe_analysis/transformations.py b/src/openfe_analysis/transformations.py index 3b057e3..95febae 100644 --- a/src/openfe_analysis/transformations.py +++ b/src/openfe_analysis/transformations.py @@ -14,11 +14,26 @@ class NoJump(TransformationBase): - """Stops an AtomGroup from moving more than half a box length between frames - - This transformation prevents an AtomGroup "teleporting" across the box - border between two subsequent frames. This then simplifies the calculation - of motion over time. + """ + Prevent an AtomGroup from jumping between periodic images. + This transformation removes large apparent + COM displacements caused by periodic boundary conditions. + + Parameters + ---------- + ag : MDAnalysis.AtomGroup + AtomGroup whose center-of-mass motion should be made continuous. + + Notes + ----- + - This transformation assumes an orthorhombic unit cell. + - Only translations are applied; no rotations or scaling. + - The correction is based on center-of-mass motion and is therefore + most appropriate for compact groups (e.g. proteins, ligands). + - Must be applied before any alignment transformations to avoid + mixing reference frames. + - Is intended to be applied before analyses that rely on smooth + time evolution (e.g. RMSD, COM motion). """ ag: mda.AtomGroup @@ -49,12 +64,13 @@ class ClosestImageShift(TransformationBase): """ PBC-safe transformation that shifts one or more target AtomGroups so that their COM is in the closest image relative to a reference AtomGroup. - Works for any box type (triclinic or orthorhombic). - CAVEAT: - This Transformation requires the AtomGroups to be unwrapped! + CAVEAT: This Transformation requires the AtomGroups to be unwrapped! - Inspired from: + Notes + ----- + - Works for any box type (triclinic or orthorhombic). + - Inspired from: https://github.com/wolberlab/OpenMMDL/blob/main/openmmdl/openmmdl_simulation/scripts/post_md_conversions.py """ @@ -75,10 +91,15 @@ def _transform(self, ts): class Aligner(TransformationBase): - """On-the-fly transformation to align a trajectory to minimise RMSD - - centers all coordinates onto origin - rotates **entire universe** to minimise rmsd relative to **ref_ag** + """ + Align a trajectory to a reference AtomGroup by minimizing RMSD. + + Notes + ----- + Performs an on-the-fly least-squares alignment + of the entire universe to a reference AtomGroup. + At each frame, the coordinates are translated and rotated to minimize the + RMSD of the atoms relative to their positions in the reference. """ ref_pos: npt.NDArray diff --git a/src/openfe_analysis/utils/multistate.py b/src/openfe_analysis/utils/multistate.py index 3816889..672e71f 100644 --- a/src/openfe_analysis/utils/multistate.py +++ b/src/openfe_analysis/utils/multistate.py @@ -24,6 +24,11 @@ def _determine_position_indices(dataset: nc.Dataset) -> NDArray: indices : NDArray[int] An ordered array of iteration indices which hold positions. + Raises + ------ + ValueError + If positions are not written at a consistent interval. + Note ---- This assumes that the indices are equally spaced by a given @@ -55,11 +60,12 @@ def _determine_position_indices(dataset: nc.Dataset) -> NDArray: def _state_to_replica(dataset: nc.Dataset, state_num: int, frame_num: int) -> int: - """Convert a state index to replica index at a given Dataset frame + """ + Map a thermodynamic state index to the corresponding replica index. Parameters ---------- - dataset : netCDF4.Dataset + dataset : nc.Dataset Dataset containing the MultiState reporter generated NetCDF file with information about all the frames and replica in the system. state_num : int @@ -86,7 +92,7 @@ def _replica_positions_at_frame( Parameters ---------- - dataset : netCDF4.Dataset + dataset : nc.Dataset Dataset containing the MultiState information. replica_index : int Replica index to extract positions for. @@ -125,7 +131,7 @@ def _create_new_dataset(filename: Path, n_atoms: int, title: str) -> nc.Dataset: Returns ------- - netCDF4.Dataset + nc.Dataset AMBER Conventions compliant NetCDF dataset to store information contained in MultiState reporter generated NetCDF file. """ @@ -170,13 +176,13 @@ def _get_unitcell( dataset: nc.Dataset, replica_index: int, frame_num: int ) -> Optional[Tuple[unit.Quantity]]: """ - Helper method to extract a unit cell from the stored + Helper method to extract unit cell dimensions from the stored box vectors in a MultiState reporter generated NetCDF file at a given state and Dataset frame. Parameters ---------- - dataset : netCDF4.Dataset + dataset : nc.Dataset Dataset of MultiState reporter generated NetCDF file. replica_index : int Replica for which to get the unit cell for.