Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ for assessing their quality.
It supports

- all *CASP*/*CAPRI* metrics and more
- small molecules to huge protein complexes
- small molecules to huge protein or nucleic acid complexes
- easy extension with custom metrics
- a command line interface and a Python API

Expand Down
3 changes: 1 addition & 2 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,13 @@ Atom Matching

Miscellaneous
-------------

.. autosummary::
:toctree: apidoc
:caption: Miscellaneous

MoleculeType
sanitize
standardize
is_small_molecule
get_contact_residues
find_atoms_by_pattern
estimate_formal_charges
Expand Down
2 changes: 1 addition & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ pepp'r documentation
It supports

- all *CASP*/*CAPRI* metrics and more
- small molecules to huge protein complexes
- small molecules to huge protein or nucleic acid complexes
- easy extension with custom metrics
- a command line interface and a Python API

Expand Down
9 changes: 7 additions & 2 deletions docs/tutorial/cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,11 @@ In this case we select the best value of the three most confident poses.
! peppr tabulate $TMPDIR/peppr.pkl $TMPDIR/table.csv top3
! cat $TMPDIR/table.csv

.. note::

The term '*polymer*' is used throughout ``peppr`` to refer to protein or nucleic
acid chains.

Aggregating results over systems
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The other type of report is an aggregated value for each metric (and each selector)
Expand All @@ -147,8 +152,8 @@ system.
For each metric the mean and median value over all systems are reported.
For some metrics, such as ``rmsd`` and ``dockq``, there is also bins the values are
sorted into.
For example, ``CA-RMSD <2.0`` gives the percentage of systems with a *CA-RMSD* below
2.0 Å.
For example, ``backbone RMSD <2.0`` gives the percentage of systems with a
:math:`C_{\alpha}`-RMSD below 2.0 Å.

Increasing the accuracy
-----------------------
Expand Down
63 changes: 62 additions & 1 deletion src/peppr/common.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
__all__ = ["is_small_molecule", "standardize"]
__all__ = ["MoleculeType", "standardize"]


import functools
from enum import Enum, auto
import biotite.structure as struc
import biotite.structure.info as info

DONOR_PATTERN = (
"["
Expand Down Expand Up @@ -37,6 +40,48 @@
HALOGEN_DISTANCE_SCALING = (0.8, 1.05)


class MoleculeType(Enum):
"""
The type of a molecule.
"""

SMALL_MOLECULE = auto()
PROTEIN = auto()
NUCLEIC_ACID = auto()

@staticmethod
def of(chain: struc.AtomArray) -> "MoleculeType":
"""
Determine the :class:`MoleculeType` of the given chain.

Parameters
----------
chain : struc.AtomArray, shape=(n,)
The chain to determine the :class:`MoleculeType` of.

Returns
-------
MoleculeType
The :class:`MoleculeType` of the given chain.

Warnings
--------
This function does not check if `chain` is truly a single chain, this is the
responsibility of the caller.
"""
if chain.hetero[0].item():
return MoleculeType.SMALL_MOLECULE
res_name = chain.res_name[0].item()
if res_name in _amino_acid_names():
return MoleculeType.PROTEIN
if res_name in _nucleotide_names():
return MoleculeType.NUCLEIC_ACID
raise ValueError(
f"Chain contains residue '{res_name}' which is not polymer component, "
"but it is also not marked as a small molecule"
)


def is_small_molecule(chain: struc.AtomArray) -> bool:
"""
Check whether the given chain is a small molecule.
Expand Down Expand Up @@ -86,3 +131,19 @@ def standardize(
if remove_monoatomic_ions:
mask &= ~struc.filter_monoatomic_ions(system)
return system[..., mask]


@functools.cache
def _amino_acid_names() -> set[str]:
"""
Get the names of all residues considered to be amino acids.
"""
return set(info.amino_acid_names())


@functools.cache
def _nucleotide_names() -> set[str]:
"""
Get the names of all residues considered to be nucleotides.
"""
return set(info.nucleotide_names())
10 changes: 5 additions & 5 deletions src/peppr/contacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ def find_stacking_interactions(
return [
self._map_stacking_indices(combined_atoms, indices_1, indices_2, kind)
for indices_1, indices_2, kind in all_interactions
# filter out intra protein and intra ligand interactions
# filter out intra polymer and intra ligand interactions
if combined_atoms.hetero[indices_1[0]]
!= combined_atoms.hetero[indices_2[0]]
]
Expand All @@ -388,13 +388,13 @@ def _map_stacking_indices(
kind: struc.PiStacking,
) -> tuple[NDArray[np.int_], NDArray[np.int_], struc.PiStacking]:
"""Map combined structure indices back to original receptor and ligand."""
# Sort to ensure protein indices comes first then ligand
protein_idx, ligand_idx = sorted(
# Sort to ensure polymer indices comes first then ligand
polymer_idx, ligand_idx = sorted(
[indices_1, indices_2], key=lambda idx: combined_atoms.hetero[idx[0]]
)

return (
self._binding_site_indices[protein_idx], # Map to full receptor
self._binding_site_indices[polymer_idx], # Map to full receptor
ligand_idx - len(self._binding_site), # Map to ligand
kind,
)
Expand Down Expand Up @@ -438,7 +438,7 @@ def find_pi_cation_interactions(
return [
self._map_pi_cation_indices(ring_indices, cation_index)
for ring_indices, cation_index in all_interactions
# filter out intra protein and intra ligand interactions
# filter out intra polymer and intra ligand interactions
if (ring_indices[0] >= binding_site_size)
!= (cation_index >= binding_site_size)
]
Expand Down
74 changes: 45 additions & 29 deletions src/peppr/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,22 @@
SanitizeFlags,
SanitizeMol,
)
from peppr.common import is_small_molecule
from peppr.common import MoleculeType

# To match atoms between pose and reference chain,
# the residue and atom name are sufficient to unambiguously identify an atom
_ANNOTATIONS_FOR_ATOM_MATCHING = ["res_name", "atom_name"]
_IDENTITY_MATRIX = align.SubstitutionMatrix(
seq.ProteinSequence.alphabet,
seq.ProteinSequence.alphabet,
np.eye(len(seq.ProteinSequence.alphabet), dtype=np.int32),
)
_IDENTITY_MATRIX = {
molecule_type: align.SubstitutionMatrix(
alphabet,
alphabet,
np.eye(len(alphabet), dtype=np.int32),
)
for molecule_type, alphabet in (
(MoleculeType.PROTEIN, seq.ProteinSequence.alphabet),
(MoleculeType.NUCLEIC_ACID, seq.NucleotideSequence.alphabet_amb),
)
}
_PADDING = -1
_UNMAPPABLE_ENTITY_ID = -1

Expand Down Expand Up @@ -610,7 +616,10 @@ def _all_global_mappings(
list[tuple[struc.AtomArray, struc.AtomArray]]
] = []
for ref_chain_i, pose_chain_i in zip(ref_chain_indices, pose_chain_indices):
if is_small_molecule(reference_chains[ref_chain_i]):
if (
MoleculeType.of(reference_chains[ref_chain_i])
== MoleculeType.SMALL_MOLECULE
):
pose_mappings = _molecule_mappings(
reference_chains[ref_chain_i], pose_chains[pose_chain_i]
)
Expand Down Expand Up @@ -761,7 +770,7 @@ def _match_using_chain_order(
matched_reference_chains = []
matched_pose_chains = []
for ref_i, pose_i in zip(reference_chain_order, pose_chain_order):
if is_small_molecule(reference_chains[ref_i]):
if MoleculeType.of(reference_chains[ref_i]) == MoleculeType.SMALL_MOLECULE:
ref_atom_order, pose_atom_order = _find_optimal_molecule_permutation(
reference_chains[ref_i],
pose_chains[pose_i],
Expand Down Expand Up @@ -800,7 +809,7 @@ def _match_common_residues(
reference: struc.AtomArray, pose: struc.AtomArray
) -> tuple[struc.AtomArray, struc.AtomArray]:
"""
Find common residues (and the common atoms) in two protein chains.
Find common residues (and the common atoms) in two polymer chains.

Parameters
----------
Expand All @@ -825,7 +834,7 @@ def _match_common_residues(
alignment = align.align_optimal(
reference_sequence,
pose_sequence,
_IDENTITY_MATRIX,
_IDENTITY_MATRIX[MoleculeType.of(reference)],
# We get mismatches due to cropping, not due to evolution
# -> linear gap penalty makes most sense
gap_penalty=-1,
Expand Down Expand Up @@ -1182,24 +1191,28 @@ def _assign_entity_ids_to_chains(
entity_ids.append(chain.entity_id[0].item())
return np.array(entity_ids, dtype=int)

molecule_types = [MoleculeType.of(chain) for chain in chains]
sequences = [
struc.to_sequence(chain)[0][0] if not is_small_molecule(chain) else None
for chain in chains
struc.to_sequence(chain)[0][0]
if molecule_type != MoleculeType.SMALL_MOLECULE
else None
for chain, molecule_type in zip(chains, molecule_types)
]
if use_structure_match:
molecules = [
_to_mol(chain) if is_small_molecule(chain) else None for chain in chains
_to_mol(chain) if molecule_type == MoleculeType.SMALL_MOLECULE else None
for chain, molecule_type in zip(chains, molecule_types)
]

current_entity_id = 0
for i, (chain, sequence) in enumerate(zip(chains, sequences)):
for i, (chain, sequence, molecule_type) in enumerate(
zip(chains, sequences, molecule_types)
):
for j in range(i):
if (sequence is None and sequences[j] is not None) or (
sequence is not None and sequences[j] is None
):
# Cannot match small molecules to polymer chains
if molecule_type != molecule_types[j]:
# Cannot match different molecule types to each other
continue
elif sequence is None:
elif molecule_type == MoleculeType.SMALL_MOLECULE:
# Small molecule case
if use_structure_match:
# It is only a complete structure match,
Expand All @@ -1222,7 +1235,7 @@ def _assign_entity_ids_to_chains(
alignment = align.align_optimal(
sequence,
sequences[j],
_IDENTITY_MATRIX,
_IDENTITY_MATRIX[molecule_type],
# We get mismatches due to experimental artifacts, not evolution
# -> linear gap penalty makes most sense
gap_penalty=-1,
Expand Down Expand Up @@ -1273,19 +1286,22 @@ def _all_anchor_combinations(
raise UnmappableEntityError(
"No chain in the pose can be mapped to a reference chain"
)
protein_chain_mask = np.array(
[not is_small_molecule(chain) for chain in pose_chains]
polymer_chain_mask = np.array(
[
not MoleculeType.of(chain) != MoleculeType.SMALL_MOLECULE
for chain in pose_chains
]
)
valid_anchor_mask = protein_chain_mask & mappable_mask
valid_anchor_mask = polymer_chain_mask & mappable_mask
if not valid_anchor_mask.any():
# No mappable protein chains
# No mappable polymer chains
# -> Simply use the first mappable small molecule as anchor
anchor_entity_id = pose_entity_ids[np.where(mappable_mask)[0][0]]
else:
valid_anchor_indices = np.where(valid_anchor_mask)[0]
protein_entity_ids = pose_entity_ids[valid_anchor_indices]
multiplicities_of_entity_ids = np.bincount(protein_entity_ids)
multiplicities = multiplicities_of_entity_ids[protein_entity_ids]
polymer_entity_ids = pose_entity_ids[valid_anchor_indices]
multiplicities_of_entity_ids = np.bincount(polymer_entity_ids)
multiplicities = multiplicities_of_entity_ids[polymer_entity_ids]
least_multiplicity_indices = np.where(multiplicities == np.min(multiplicities))[
0
]
Expand Down Expand Up @@ -1335,7 +1351,7 @@ def _get_superimposition_transform(
transform : AffineTransformation
The transformation that superimposes the pose chain onto the reference chain.
"""
if is_small_molecule(reference_chain):
if MoleculeType.of(reference_chain) == MoleculeType.SMALL_MOLECULE:
if reference_chain.array_length() == pose_chain.array_length():
_, transform = struc.superimpose(reference_chain, pose_chain)
else:
Expand All @@ -1351,7 +1367,7 @@ def _get_superimposition_transform(
_, transform, _, _ = struc.superimpose_homologs(
reference_chain,
pose_chain,
_IDENTITY_MATRIX,
_IDENTITY_MATRIX[MoleculeType.of(reference_chain)],
gap_penalty=-1,
min_anchors=1,
# No outlier removal
Expand Down
Loading