Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 28 additions & 6 deletions src/atomworks/io/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def parse(
build_assembly: Literal["first", "all"] | list[str] | tuple[str] | None = "all",
extra_fields: list[str] | Literal["all"] | None = None,
keep_cif_block: bool = False,
altloc: Literal["first", "occupancy", "all"] | str = "first",
) -> dict[str, Any]:
"""Entrypoint for general parsing of atomic-level structure files.

Expand Down Expand Up @@ -204,6 +205,9 @@ def parse(
extra_fields (list, optional): A list of extra fields to include in the AtomArrayStack. Defaults to None. "all" includes all fields.
Only supports mmCIF files.
keep_cif_block (bool, optional): Whether to keep the CIF block in the result. Defaults to False.
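altloc (Literal['first', 'occupancy', 'all'] | str, optional): How to handle alternate location indicators.
Options are 'first' (keep the first altloc), 'occupancy' (keep the altloc with highest occupancy),
'all' (keep all altlocs), or a specific altloc identifier (e.g., 'A' to keep only altloc 'A'). Defaults to 'first'.
Note that 'all' cannot be combined with `add_missing_atoms=True` (a ValueError is raised), and combining it
with `fix_bond_types=True` logs a warning because the bond corrections may be incorrect.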

Returns:
dict: A dictionary containing the following keys:
Expand Down Expand Up @@ -251,6 +255,20 @@ def parse(
"after adding inter-residue bonds. To avoid this and fix formal charges, set `add_missing_atoms = True`."
)

if altloc == "all" and add_missing_atoms:
raise ValueError(
"altloc='all' is not compatible with add_missing_atoms=True. "
"Template matching requires unique atom names per residue, which is not guaranteed with multiple altlocs. "
"Use altloc='first' (default) with add_missing_atoms=True, or set add_missing_atoms=False to preserve all altlocs."
)

if altloc == "all" and fix_bond_types:
logger.warning(
"altloc='all' with fix_bond_types=True may produce incorrect bond corrections. "
"Multiple altlocs cause atoms to appear with higher degree than expected, triggering spurious corrections. "
"Consider using fix_bond_types=False when preserving all altlocs."
)

file_type = file_type or infer_pdb_file_type(filename)
is_buffer = isinstance(filename, io.StringIO | io.BytesIO)

Expand Down Expand Up @@ -328,6 +346,7 @@ def parse(
model=model,
build_assembly=build_assembly,
extra_fields=extra_fields,
altloc=altloc,
)
elif file_type in ("cif", "bcif"):
result = _parse_from_cif(
Expand All @@ -348,6 +367,7 @@ def parse(
build_assembly=build_assembly,
extra_fields=extra_fields,
keep_cif_block=keep_cif_block,
altloc=altloc,
)
else:
raise ValueError(f"Unsupported file type: {filename}")
Expand Down Expand Up @@ -636,9 +656,9 @@ def parse_atom_array(

# ... build assemblies and add assembly-specific annotations (instance IDs like `chain_iid`, `pn_unit_iid`, `molecule_iid`)
if exists(build_assembly):
assert build_assembly in ["first", "all"] or isinstance(
build_assembly, list | tuple
), "Invalid `build_assembly` option. Must be 'first', 'all', or a list/tuple of assembly IDs as strings."
assert build_assembly in ["first", "all"] or isinstance(build_assembly, list | tuple), (
"Invalid `build_assembly` option. Must be 'first', 'all', or a list/tuple of assembly IDs as strings."
)

# Determine assembly categories: use CIF data if build_assembly is set, otherwise identity operations
if exists(build_assembly) and exists(_cif_file) and "pdbx_struct_assembly" in data_dict["cif_block"]:
Expand Down Expand Up @@ -729,6 +749,7 @@ def _parse_from_cif(filename: os.PathLike | io.StringIO | io.BytesIO, **kwargs)
model=kwargs["model"],
add_bond_types_from_struct_conn=kwargs["add_bond_types_from_struct_conn"],
fix_bond_types=kwargs["fix_bond_types"],
altloc=kwargs.get("altloc", "first"),
)
except InvalidFileError:
logger.info("Invalid file error encountered; loading with only one model")
Expand All @@ -739,10 +760,11 @@ def _parse_from_cif(filename: os.PathLike | io.StringIO | io.BytesIO, **kwargs)
model=1,
add_bond_types_from_struct_conn=kwargs["add_bond_types_from_struct_conn"],
fix_bond_types=kwargs["fix_bond_types"],
altloc=kwargs.get("altloc", "first"),
)

# process the asym_unit_stack according to the given keyword arguments
kwargs_to_pass = {k: v for k, v in kwargs.items() if k not in ["model", "file_type", "keep_cif_block"]}
kwargs_to_pass = {k: v for k, v in kwargs.items() if k not in ["model", "file_type", "keep_cif_block", "altloc"]}
data_dict = parse_atom_array(asym_unit_stack, data_dict=data_dict, _cif_file=cif_file, **kwargs_to_pass)

# Extract the asym_unit_stack from the returned data_dict
Expand Down Expand Up @@ -785,7 +807,7 @@ def _parse_from_pdb(filename: os.PathLike, **parse_from_cif_kwargs) -> dict[str,
pdb_file = read_any(filename)
atom_array_stack = pdb_file.get_structure(
model=parse_from_cif_kwargs["model"],
altloc="first",
altloc=parse_from_cif_kwargs.get("altloc", "first"),
extra_fields=["b_factor", "occupancy", "charge", "atom_id"],
include_bonds=True,
)
Expand Down Expand Up @@ -822,7 +844,7 @@ def _parse_from_pdb(filename: os.PathLike, **parse_from_cif_kwargs) -> dict[str,
# PDB files use identity assembly, so "all" builds just the single identity assembly
parse_from_cif_kwargs["build_assembly"] = "all"

kwargs_to_pass = {k: v for k, v in parse_from_cif_kwargs.items() if k not in ["model", "file_type"]}
kwargs_to_pass = {k: v for k, v in parse_from_cif_kwargs.items() if k not in ["model", "file_type", "altloc"]}
data_dict = parse_atom_array(atom_array_stack, _cif_file=None, **kwargs_to_pass)
data_dict["metadata"]["id"] = Path(filename).stem.lower()

Expand Down
5 changes: 4 additions & 1 deletion src/atomworks/io/utils/io_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,10 @@ def load_any(
If "all", all fields in the 'atom_site' category of the file will be included.
include_bonds: Whether to include bonds in the structure.
model: The model number to use for loading the structure. If None, all models will be loaded.
altloc: The altloc ID to use for loading the structure.
altloc: The altloc ID to use for loading the structure. If "first", the first altloc will be used.
If "occupancy", the altloc with highest occupancy for each atom will be used. If "all", all altlocs will be included.
If any other string is provided, it is used as the altloc ID to filter the structure by; that altloc ID
must be present in the file, otherwise an error is raised. Defaults to "occupancy".

Returns:
The loaded structure with the specified fields and assumptions.
Expand Down
Loading