From 6a461871fe61a4fd1e5651624d17528f3343b987 Mon Sep 17 00:00:00 2001 From: Karson Chrispens Date: Wed, 17 Dec 2025 20:21:16 -0800 Subject: [PATCH] fix(parser): parser exposes altloc argument Altlocs are now selected according to the new `altloc` argument when parsing both PDB and CIF files. --- src/atomworks/io/parser.py | 34 ++++++++++++++++++++++++------ src/atomworks/io/utils/io_utils.py | 5 ++++- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/atomworks/io/parser.py b/src/atomworks/io/parser.py index e539d353..d5833b81 100644 --- a/src/atomworks/io/parser.py +++ b/src/atomworks/io/parser.py @@ -145,6 +145,7 @@ def parse( build_assembly: Literal["first", "all"] | list[str] | tuple[str] | None = "all", extra_fields: list[str] | Literal["all"] | None = None, keep_cif_block: bool = False, + altloc: Literal["first", "occupancy", "all"] | str = "first", ) -> dict[str, Any]: """Entrypoint for general parsing of atomic-level structure files. @@ -204,6 +205,9 @@ def parse( extra_fields (list, optional): A list of extra fields to include in the AtomArrayStack. Defaults to None. "all" includes all fields. Only supports mmCIF files. keep_cif_block (bool, optional): Whether to keep the CIF block in the result. Defaults to False. + altloc (Literal['first', 'occupancy', 'all'] | str, optional): How to handle alternate location indicators. + Options are 'first' (keep the first altloc), 'occupancy' (keep the altloc with highest occupancy), + 'all' (keep all altlocs), or a specific altloc identifier (e.g., 'A' to keep only altloc 'A'). Defaults to 'first'. Returns: dict: A dictionary containing the following keys: @@ -251,6 +255,20 @@ def parse( "after adding inter-residue bonds. To avoid this and fix formal charges, set `add_missing_atoms = True`." ) + if altloc == "all" and add_missing_atoms: + raise ValueError( + "altloc='all' is not compatible with add_missing_atoms=True. " + "Template matching requires unique atom names per residue, which is not guaranteed with multiple altlocs. 
" + "Use altloc='first' (default) with add_missing_atoms=True, or set add_missing_atoms=False to preserve all altlocs." + ) + + if altloc == "all" and fix_bond_types: + logger.warning( + "altloc='all' with fix_bond_types=True may produce incorrect bond corrections. " + "Multiple altlocs cause atoms to appear with higher degree than expected, triggering spurious corrections. " + "Consider using fix_bond_types=False when preserving all altlocs." + ) + file_type = file_type or infer_pdb_file_type(filename) is_buffer = isinstance(filename, io.StringIO | io.BytesIO) @@ -328,6 +346,7 @@ def parse( model=model, build_assembly=build_assembly, extra_fields=extra_fields, + altloc=altloc, ) elif file_type in ("cif", "bcif"): result = _parse_from_cif( @@ -348,6 +367,7 @@ def parse( build_assembly=build_assembly, extra_fields=extra_fields, keep_cif_block=keep_cif_block, + altloc=altloc, ) else: raise ValueError(f"Unsupported file type: {filename}") @@ -636,9 +656,9 @@ def parse_atom_array( # ... build assemblies and add assembly-specific annotations (instance IDs like `chain_iid`, `pn_unit_iid`, `molecule_iid`) if exists(build_assembly): - assert build_assembly in ["first", "all"] or isinstance( - build_assembly, list | tuple - ), "Invalid `build_assembly` option. Must be 'first', 'all', or a list/tuple of assembly IDs as strings." + assert build_assembly in ["first", "all"] or isinstance(build_assembly, list | tuple), ( + "Invalid `build_assembly` option. Must be 'first', 'all', or a list/tuple of assembly IDs as strings." 
+ ) # Determine assembly categories: use CIF data if build_assembly is set, otherwise identity operations if exists(build_assembly) and exists(_cif_file) and "pdbx_struct_assembly" in data_dict["cif_block"]: @@ -729,6 +749,7 @@ def _parse_from_cif(filename: os.PathLike | io.StringIO | io.BytesIO, **kwargs) model=kwargs["model"], add_bond_types_from_struct_conn=kwargs["add_bond_types_from_struct_conn"], fix_bond_types=kwargs["fix_bond_types"], + altloc=kwargs.get("altloc", "first"), ) except InvalidFileError: logger.info("Invalid file error encountered; loading with only one model") @@ -739,10 +760,11 @@ def _parse_from_cif(filename: os.PathLike | io.StringIO | io.BytesIO, **kwargs) model=1, add_bond_types_from_struct_conn=kwargs["add_bond_types_from_struct_conn"], fix_bond_types=kwargs["fix_bond_types"], + altloc=kwargs.get("altloc", "first"), ) # process the asym_unit_stack according to the given keyword arguments - kwargs_to_pass = {k: v for k, v in kwargs.items() if k not in ["model", "file_type", "keep_cif_block"]} + kwargs_to_pass = {k: v for k, v in kwargs.items() if k not in ["model", "file_type", "keep_cif_block", "altloc"]} data_dict = parse_atom_array(asym_unit_stack, data_dict=data_dict, _cif_file=cif_file, **kwargs_to_pass) # Extract the asym_unit_stack from the returned data_dict @@ -785,7 +807,7 @@ def _parse_from_pdb(filename: os.PathLike, **parse_from_cif_kwargs) -> dict[str, pdb_file = read_any(filename) atom_array_stack = pdb_file.get_structure( model=parse_from_cif_kwargs["model"], - altloc="first", + altloc=parse_from_cif_kwargs.get("altloc", "first"), extra_fields=["b_factor", "occupancy", "charge", "atom_id"], include_bonds=True, ) @@ -822,7 +844,7 @@ def _parse_from_pdb(filename: os.PathLike, **parse_from_cif_kwargs) -> dict[str, # PDB files use identity assembly, so "all" builds just the single identity assembly parse_from_cif_kwargs["build_assembly"] = "all" - kwargs_to_pass = {k: v for k, v in parse_from_cif_kwargs.items() if k not in 
["model", "file_type"]} + kwargs_to_pass = {k: v for k, v in parse_from_cif_kwargs.items() if k not in ["model", "file_type", "altloc"]} data_dict = parse_atom_array(atom_array_stack, _cif_file=None, **kwargs_to_pass) data_dict["metadata"]["id"] = Path(filename).stem.lower() diff --git a/src/atomworks/io/utils/io_utils.py b/src/atomworks/io/utils/io_utils.py index 3817f2a0..56d2fd89 100644 --- a/src/atomworks/io/utils/io_utils.py +++ b/src/atomworks/io/utils/io_utils.py @@ -102,7 +102,10 @@ def load_any( If "all", all fields in the 'atom_site' category of the file will be included. include_bonds: Whether to include bonds in the structure. model: The model number to use for loading the structure. If None, all models will be loaded. - altloc: The altloc ID to use for loading the structure. + altloc: The altloc ID to use for loading the structure. If "first", the first altloc will be used. + If "occupancy", the altloc with highest occupancy for each atom will be used. If "all", all altlocs will be included. + If a string is provided, it will be used as the altloc ID to filter the structure by and it is assumed + that that altloc ID is present in the file. If it is not present, an error will be raised. Defaults to "occupancy". Returns: The loaded structure with the specified fields and assumptions.